whispercpp 1.3.5 → 1.3.6
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/LICENSE +1 -1
- data/README.md +99 -2
- data/ext/extconf.rb +1 -0
- data/ext/ruby_whisper.c +20 -4
- data/ext/ruby_whisper.h +30 -2
- data/ext/ruby_whisper_context.c +216 -124
- data/ext/ruby_whisper_context_params.c +163 -0
- data/ext/ruby_whisper_model.c +0 -1
- data/ext/ruby_whisper_params.c +0 -1
- data/ext/ruby_whisper_segment.c +0 -1
- data/ext/ruby_whisper_token.c +29 -9
- data/ext/ruby_whisper_transcribe.cpp +4 -1
- data/ext/ruby_whisper_vad_context.c +48 -1
- data/ext/ruby_whisper_vad_context_detect.cpp +6 -5
- data/ext/ruby_whisper_vad_params.c +0 -1
- data/ext/ruby_whisper_vad_segment.c +0 -1
- data/ext/ruby_whisper_vad_segments.c +0 -1
- data/ext/sources/CMakeLists.txt +1 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/cmake/whisper-config.cmake.in +5 -40
- data/ext/sources/examples/bench/bench.cpp +23 -18
- data/ext/sources/examples/cli/cli.cpp +8 -0
- data/ext/sources/examples/common-ggml.cpp +2 -0
- data/ext/sources/examples/miniaudio.h +4507 -2131
- data/ext/sources/examples/server/server.cpp +18 -4
- data/ext/sources/examples/talk-llama/CMakeLists.txt +3 -2
- data/ext/sources/examples/talk-llama/llama-adapter.cpp +7 -13
- data/ext/sources/examples/talk-llama/llama-adapter.h +4 -3
- data/ext/sources/examples/talk-llama/llama-arch.cpp +335 -17
- data/ext/sources/examples/talk-llama/llama-arch.h +42 -0
- data/ext/sources/examples/talk-llama/llama-batch.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-chat.cpp +21 -1
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +508 -520
- data/ext/sources/examples/talk-llama/llama-context.h +27 -28
- data/ext/sources/examples/talk-llama/llama-cparams.h +5 -0
- data/ext/sources/examples/talk-llama/llama-ext.h +12 -0
- data/ext/sources/examples/talk-llama/llama-grammar.cpp +8 -8
- data/ext/sources/examples/talk-llama/llama-graph.cpp +583 -130
- data/ext/sources/examples/talk-llama/llama-graph.h +131 -10
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +57 -40
- data/ext/sources/examples/talk-llama/llama-hparams.h +79 -10
- data/ext/sources/examples/talk-llama/llama-impl.cpp +4 -4
- data/ext/sources/examples/talk-llama/llama-impl.h +13 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache-iswa.cpp +3 -1
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +274 -89
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +2 -3
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.cpp +275 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid-iswa.h +140 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +11 -13
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +28 -11
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +527 -119
- data/ext/sources/examples/talk-llama/llama-model-loader.h +35 -5
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +60 -46
- data/ext/sources/examples/talk-llama/llama-model-saver.h +5 -2
- data/ext/sources/examples/talk-llama/llama-model.cpp +1365 -647
- data/ext/sources/examples/talk-llama/llama-model.h +72 -19
- data/ext/sources/examples/talk-llama/llama-quant.cpp +578 -346
- data/ext/sources/examples/talk-llama/{llama-sampling.cpp → llama-sampler.cpp} +190 -76
- data/ext/sources/examples/talk-llama/{llama-sampling.h → llama-sampler.h} +0 -2
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +118 -48
- data/ext/sources/examples/talk-llama/llama-vocab.h +5 -0
- data/ext/sources/examples/talk-llama/llama.cpp +76 -22
- data/ext/sources/examples/talk-llama/llama.h +63 -30
- data/ext/sources/examples/talk-llama/models/afmoe.cpp +2 -3
- data/ext/sources/examples/talk-llama/models/apertus.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arcee.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/arctic.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/baichuan.cpp +4 -3
- data/ext/sources/examples/talk-llama/models/bailingmoe.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/bailingmoe2.cpp +3 -5
- data/ext/sources/examples/talk-llama/models/bert.cpp +13 -7
- data/ext/sources/examples/talk-llama/models/bitnet.cpp +9 -24
- data/ext/sources/examples/talk-llama/models/bloom.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/chameleon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/chatglm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/codeshell.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cogvlm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/cohere2-iswa.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/command-r.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/dbrx.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/deci.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/deepseek.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/deepseek2.cpp +24 -21
- data/ext/sources/examples/talk-llama/models/delta-net-base.cpp +445 -0
- data/ext/sources/examples/talk-llama/models/dots1.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/dream.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/ernie4-5-moe.cpp +4 -6
- data/ext/sources/examples/talk-llama/models/ernie4-5.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/eurobert.cpp +97 -0
- data/ext/sources/examples/talk-llama/models/exaone-moe.cpp +145 -0
- data/ext/sources/examples/talk-llama/models/exaone.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/exaone4.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/falcon-h1.cpp +2 -4
- data/ext/sources/examples/talk-llama/models/falcon.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/gemma-embedding.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma2-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/gemma3n-iswa.cpp +7 -7
- data/ext/sources/examples/talk-llama/models/glm4-moe.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/glm4.cpp +14 -7
- data/ext/sources/examples/talk-llama/models/gpt2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/gptneox.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/granite-hybrid.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/granite.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/grok.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/grovemoe.cpp +5 -7
- data/ext/sources/examples/talk-llama/models/hunyuan-dense.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/hunyuan-moe.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/internlm2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/jais.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/jais2.cpp +123 -0
- data/ext/sources/examples/talk-llama/models/jamba.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/kimi-linear.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/lfm2.cpp +145 -124
- data/ext/sources/examples/talk-llama/models/llada-moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llada.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/llama-iswa.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/llama.cpp +18 -11
- data/ext/sources/examples/talk-llama/models/maincoder.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/{graph-context-mamba.cpp → mamba-base.cpp} +9 -3
- data/ext/sources/examples/talk-llama/models/mamba.cpp +1 -2
- data/ext/sources/examples/talk-llama/models/mimo2-iswa.cpp +11 -5
- data/ext/sources/examples/talk-llama/models/minicpm3.cpp +14 -13
- data/ext/sources/examples/talk-llama/models/minimax-m2.cpp +4 -5
- data/ext/sources/examples/talk-llama/models/mistral3.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/models.h +181 -46
- data/ext/sources/examples/talk-llama/models/modern-bert.cpp +2 -9
- data/ext/sources/examples/talk-llama/models/mpt.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/nemotron-h.cpp +26 -14
- data/ext/sources/examples/talk-llama/models/nemotron.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/neo-bert.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/olmo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmo2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/olmoe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/openai-moe-iswa.cpp +1 -1
- data/ext/sources/examples/talk-llama/models/openelm.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/orion.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/paddleocr.cpp +122 -0
- data/ext/sources/examples/talk-llama/models/pangu-embedded.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/phi2.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/phi3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/plamo2.cpp +9 -5
- data/ext/sources/examples/talk-llama/models/plamo3.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/plm.cpp +15 -14
- data/ext/sources/examples/talk-llama/models/qwen.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/qwen2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen2moe.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/qwen2vl.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/qwen3.cpp +12 -9
- data/ext/sources/examples/talk-llama/models/qwen35.cpp +381 -0
- data/ext/sources/examples/talk-llama/models/qwen35moe.cpp +422 -0
- data/ext/sources/examples/talk-llama/models/qwen3moe.cpp +15 -8
- data/ext/sources/examples/talk-llama/models/qwen3next.cpp +84 -432
- data/ext/sources/examples/talk-llama/models/qwen3vl-moe.cpp +9 -18
- data/ext/sources/examples/talk-llama/models/qwen3vl.cpp +8 -17
- data/ext/sources/examples/talk-llama/models/refact.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/rnd1.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/rwkv6-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/rwkv7-base.cpp +2 -0
- data/ext/sources/examples/talk-llama/models/seed-oss.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/smallthinker.cpp +4 -4
- data/ext/sources/examples/talk-llama/models/smollm3.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/stablelm.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/starcoder2.cpp +3 -3
- data/ext/sources/examples/talk-llama/models/step35-iswa.cpp +165 -0
- data/ext/sources/examples/talk-llama/models/t5-dec.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/t5-enc.cpp +2 -2
- data/ext/sources/examples/talk-llama/models/xverse.cpp +3 -3
- data/ext/sources/examples/talk-llama/unicode.cpp +21 -65
- data/ext/sources/ggml/CMakeLists.txt +9 -3
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cann.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +5 -0
- data/ext/sources/ggml/include/ggml-openvino.h +37 -0
- data/ext/sources/ggml/include/ggml-opt.h +1 -1
- data/ext/sources/ggml/include/ggml-rpc.h +6 -1
- data/ext/sources/ggml/include/ggml-virtgpu.h +14 -0
- data/ext/sources/ggml/include/ggml.h +56 -9
- data/ext/sources/ggml/src/CMakeLists.txt +3 -0
- data/ext/sources/ggml/src/ggml-alloc.c +4 -9
- data/ext/sources/ggml/src/ggml-backend-dl.cpp +48 -0
- data/ext/sources/ggml/src/ggml-backend-dl.h +45 -0
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +28 -86
- data/ext/sources/ggml/src/ggml-backend.cpp +5 -2
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-blas/ggml-blas.cpp +6 -2
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cann/acl_tensor.h +1 -1
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.cpp +348 -189
- data/ext/sources/ggml/src/ggml-cann/aclnn_ops.h +40 -85
- data/ext/sources/ggml/src/ggml-cann/common.h +3 -4
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +44 -62
- data/ext/sources/ggml/src/ggml-common.h +11 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +16 -11
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +42 -19
- data/ext/sources/ggml/src/ggml-cpu/amx/common.h +34 -10
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +85 -85
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +85 -1
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2744 -548
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +1653 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +1391 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +8 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +9 -9
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/repack.cpp +118 -18
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +107 -26
- data/ext/sources/ggml/src/ggml-cpu/binary-ops.cpp +2 -6
- data/ext/sources/ggml/src/ggml-cpu/common.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +59 -12
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +15 -0
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kernels.cpp +21 -20
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +965 -252
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +584 -197
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +903 -188
- data/ext/sources/ggml/src/ggml-cpu/ops.h +1 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +40 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.h +3 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +2890 -679
- data/ext/sources/ggml/src/ggml-cpu/repack.h +119 -8
- data/ext/sources/ggml/src/ggml-cpu/simd-gemm.h +136 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +111 -3
- data/ext/sources/ggml/src/ggml-cpu/unary-ops.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +17 -0
- data/ext/sources/ggml/src/ggml-cuda/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-cuda/argsort.cu +19 -10
- data/ext/sources/ggml/src/ggml-cuda/binbcast.cu +32 -30
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +134 -18
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +41 -27
- data/ext/sources/ggml/src/ggml-cuda/cpy.cu +6 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +78 -64
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +384 -143
- data/ext/sources/ggml/src/ggml-cuda/fattn-tile.cuh +36 -22
- data/ext/sources/ggml/src/ggml-cuda/fattn-vec.cuh +3 -3
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +26 -5
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/fattn.cu +127 -12
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cu +263 -0
- data/ext/sources/ggml/src/ggml-cuda/gated_delta_net.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +595 -200
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +9 -8
- data/ext/sources/ggml/src/ggml-cuda/mma.cuh +173 -6
- data/ext/sources/ggml/src/ggml-cuda/mmf.cu +30 -10
- data/ext/sources/ggml/src/ggml-cuda/mmf.cuh +158 -85
- data/ext/sources/ggml/src/ggml-cuda/mmq.cuh +34 -22
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cu +127 -67
- data/ext/sources/ggml/src/ggml-cuda/mmvf.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cu +157 -65
- data/ext/sources/ggml/src/ggml-cuda/mmvq.cuh +1 -0
- data/ext/sources/ggml/src/ggml-cuda/norm.cu +18 -76
- data/ext/sources/ggml/src/ggml-cuda/pad.cu +13 -10
- data/ext/sources/ggml/src/ggml-cuda/quantize.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/reduce_rows.cuh +2 -16
- data/ext/sources/ggml/src/ggml-cuda/rope.cu +233 -133
- data/ext/sources/ggml/src/ggml-cuda/softmax.cu +8 -83
- data/ext/sources/ggml/src/ggml-cuda/solve_tri.cu +1 -1
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cu +56 -32
- data/ext/sources/ggml/src/ggml-cuda/ssm-conv.cuh +1 -1
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_32.cu +5 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu +1 -0
- data/ext/sources/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +3 -3
- data/ext/sources/ggml/src/ggml-cuda/top-k.cu +0 -1
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cu +199 -135
- data/ext/sources/ggml/src/ggml-cuda/topk-moe.cuh +20 -14
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +55 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +2 -0
- data/ext/sources/ggml/src/ggml-cuda/vecdotq.cuh +31 -17
- data/ext/sources/ggml/src/ggml-cuda/vendors/hip.h +10 -0
- data/ext/sources/ggml/src/ggml-hexagon/CMakeLists.txt +82 -45
- data/ext/sources/ggml/src/ggml-hexagon/ggml-hexagon.cpp +334 -160
- data/ext/sources/ggml/src/ggml-hexagon/htp/CMakeLists.txt +7 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp/act-ops.c +328 -197
- data/ext/sources/ggml/src/ggml-hexagon/htp/argsort-ops.c +281 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/binary-ops.c +765 -234
- data/ext/sources/ggml/src/ggml-hexagon/htp/cpy-ops.c +252 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +412 -265
- data/ext/sources/ggml/src/ggml-hexagon/htp/get-rows-ops.c +23 -23
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.c → hex-dma.c} +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/{htp-dma.h → hex-dma.h} +28 -3
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-dump.h +77 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-fastdiv.h +37 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hex-utils.h +51 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ctx.h +1 -1
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-msg.h +27 -37
- data/ext/sources/ggml/src/ggml-hexagon/htp/htp-ops.h +6 -35
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-arith.h +443 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-base.h +240 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-copy.h +245 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-div.h +251 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-dump.h +129 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.h +215 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-floor.h +100 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.h +210 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-reduce.h +296 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-scale.h +133 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.h +141 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sqrt.h +126 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-types.h +36 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.h +20 -1347
- data/ext/sources/ggml/src/ggml-hexagon/htp/main.c +211 -13
- data/ext/sources/ggml/src/ggml-hexagon/htp/matmul-ops.c +1119 -952
- data/ext/sources/ggml/src/ggml-hexagon/htp/rope-ops.c +254 -244
- data/ext/sources/ggml/src/ggml-hexagon/htp/set-rows-ops.c +36 -36
- data/ext/sources/ggml/src/ggml-hexagon/htp/softmax-ops.c +155 -138
- data/ext/sources/ggml/src/ggml-hexagon/htp/ssm-conv.c +339 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/sum-rows-ops.c +128 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp/unary-ops.c +209 -114
- data/ext/sources/ggml/src/ggml-hexagon/htp/worker-pool.c +1 -5
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.cpp +418 -0
- data/ext/sources/ggml/src/ggml-hexagon/htp-drv.h +121 -0
- data/ext/sources/ggml/src/ggml-hexagon/libdl.h +79 -0
- data/ext/sources/ggml/src/ggml-hexagon/libggml-htp.inf +38 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +6 -0
- data/ext/sources/ggml/src/ggml-impl.h +62 -0
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +10 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp +13 -2
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.h +8 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m +147 -17
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.cpp +274 -73
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.h +22 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-device.m +102 -36
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +174 -23
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.cpp +580 -280
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-ops.h +5 -4
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.cpp +320 -107
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +1068 -825
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +19 -1
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +3108 -636
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +41 -99
- data/ext/sources/ggml/src/ggml-opencl/kernels/cpy.cl +45 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cumsum.cl +139 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/cvt.cl +204 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/diag.cl +27 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/exp.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/expm1.cl +87 -56
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q4_1_f32.cl +132 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_general_q8_0_f32.cl +195 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q4_1_f32.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/l2_norm.cl +71 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mean.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_0_f32_l4_lm.cl +163 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q4_1_f32_l4_lm.cl +165 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q6_k_f32_l4_lm.cl +158 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_8x4.cl +129 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32.cl +219 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_1_f32_flat.cl +229 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q4_k_f32.cl +180 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/{mul_mv_q6_k.cl → mul_mv_q6_k_f32.cl} +4 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_q6_k_f32_flat.cl +194 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/neg.cl +125 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +31 -32
- data/ext/sources/ggml/src/ggml-opencl/kernels/scale.cl +14 -4
- data/ext/sources/ggml/src/ggml-opencl/kernels/softplus.cl +88 -60
- data/ext/sources/ggml/src/ggml-opencl/kernels/solve_tri.cl +51 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +114 -13
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +94 -48
- data/ext/sources/ggml/src/ggml-opencl/kernels/transpose.cl +26 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tri.cl +32 -0
- data/ext/sources/ggml/src/ggml-openvino/.clang-format +154 -0
- data/ext/sources/ggml/src/ggml-openvino/CMakeLists.txt +22 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.cpp +975 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-decoder.h +294 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +373 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino-extra.h +182 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-openvino.cpp +1110 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.cpp +884 -0
- data/ext/sources/ggml/src/ggml-openvino/ggml-quants.h +153 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/decoder.h +74 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/frontend.h +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.cpp +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/input_model.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/node_context.h +112 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cont.cpp +48 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/cpy.cpp +21 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +69 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +61 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +62 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +90 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/permute.cpp +102 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/reshape.cpp +83 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/rope.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/scale.cpp +41 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +76 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/softmax.cpp +89 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/transpose.cpp +23 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +27 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op/view.cpp +53 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.cpp +46 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/op_table.h +39 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.cpp +123 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/eliminate_zp.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.cpp +60 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/fuse_to_sdpa.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +29 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.cpp +58 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/pass/squeeze_matmul.h +17 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.cpp +293 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/translate_session.h +28 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.cpp +226 -0
- data/ext/sources/ggml/src/ggml-openvino/openvino/utils.h +85 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.cpp +823 -0
- data/ext/sources/ggml/src/ggml-openvino/utils.h +123 -0
- data/ext/sources/ggml/src/ggml-quants.c +96 -5
- data/ext/sources/ggml/src/ggml-quants.h +3 -0
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +15 -88
- data/ext/sources/ggml/src/ggml-sycl/add-id.cpp +5 -1
- data/ext/sources/ggml/src/ggml-sycl/backend.hpp +1 -0
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +21 -20
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +315 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +69 -1
- data/ext/sources/ggml/src/ggml-sycl/convert.hpp +22 -1
- data/ext/sources/ggml/src/ggml-sycl/count-equal.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +791 -47
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +78 -68
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +2 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-common.hpp +1179 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.cpp +55 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-tile.hpp +1338 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn-vec.hpp +667 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.cpp +225 -0
- data/ext/sources/ggml/src/ggml-sycl/fattn.hpp +22 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.cpp +309 -0
- data/ext/sources/ggml/src/ggml-sycl/gated_delta_net.hpp +8 -0
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +316 -51
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +65 -66
- data/ext/sources/ggml/src/ggml-sycl/outprod.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/presets.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +450 -287
- data/ext/sources/ggml/src/ggml-sycl/rope.hpp +6 -0
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +6 -6
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq112-dv112.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq128-dv128.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq256-dv256.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq40-dv40.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq576-dv512.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq64-dv64.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq72-dv72.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq80-dv80.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-tile-instance-dkq96-dv96.cpp +5 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-f16-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q4_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q5_1-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-f16.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q4_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q5_1.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/template-instances/fattn-vec-instance-q8_0-q8_0.cpp +7 -0
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +13 -0
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +1 -1
- data/ext/sources/ggml/src/ggml-virtgpu/CMakeLists.txt +70 -0
- data/ext/sources/ggml/src/ggml-virtgpu/apir_cs_ggml-rpc-front.cpp +87 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/CMakeLists.txt +21 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/apir_cs_ggml-rpc-back.cpp +115 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-convert.h +13 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +102 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +105 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched-device.cpp +148 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +51 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +73 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +27 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +32 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/backend.cpp +144 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +95 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +94 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +50 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +378 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +232 -0
- data/ext/sources/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +81 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-buffer.cpp +119 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +158 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +213 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-backend.cpp +69 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggml-remoting.h +71 -0
- data/ext/sources/ggml/src/ggml-virtgpu/ggmlremoting_functions.yaml +166 -0
- data/ext/sources/ggml/src/ggml-virtgpu/include/apir_hw.h +9 -0
- data/ext/sources/ggml/src/ggml-virtgpu/regenerate_remoting.py +333 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-apir.h +15 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +58 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +110 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +173 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +192 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +36 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +53 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.cpp +98 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-shm.h +23 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.cpp +179 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu-utils.h +86 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.cpp +544 -0
- data/ext/sources/ggml/src/ggml-virtgpu/virtgpu.h +117 -0
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +1 -1
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1250 -465
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +16 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +27 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +374 -170
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +66 -22
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +389 -201
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +106 -58
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_mask_opt.comp +162 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +9 -8
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp +128 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +12 -9
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +20 -17
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +11 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +8 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +3 -1
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +5 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +3 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +2 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +36 -63
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +10 -5
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +7 -4
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +21 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +16 -10
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +55 -35
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +1314 -109
- data/ext/sources/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1660 -1371
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argmax.wgsl +72 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort.wgsl +106 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/argsort_merge.wgsl +134 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +141 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +65 -72
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/concat.wgsl +75 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +6 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/cumsum.wgsl +66 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +40 -5
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +105 -60
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{get_rows.tmpl.wgsl → get_rows.wgsl} +53 -259
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat.tmpl.wgsl → mul_mat.wgsl} +68 -257
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +692 -23
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_reg_tile.tmpl.wgsl → mul_mat_reg_tile.wgsl} +28 -128
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{mul_mat_subgroup_matrix.tmpl.wgsl → mul_mat_subgroup_matrix.wgsl} +31 -137
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +480 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/pad.wgsl +86 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl +67 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/{scale.tmpl.wgsl → scale.wgsl} +9 -36
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +40 -12
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/sum_rows.wgsl +55 -0
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary.wgsl +193 -0
- data/ext/sources/ggml/src/ggml-zdnn/ggml-zdnn.cpp +6 -1
- data/ext/sources/ggml/src/ggml-zendnn/CMakeLists.txt +31 -32
- data/ext/sources/ggml/src/ggml-zendnn/ggml-zendnn.cpp +9 -6
- data/ext/sources/ggml/src/ggml.c +167 -33
- data/ext/sources/ggml/src/gguf.cpp +229 -44
- data/ext/sources/src/whisper.cpp +6 -28
- data/sig/whisper.rbs +43 -2
- data/test/test_context_params.rb +82 -0
- data/test/test_token.rb +11 -0
- data/test/test_vad_context.rb +58 -8
- data/test/test_whisper.rb +20 -0
- data/whispercpp.gemspec +1 -1
- metadata +240 -28
- data/ext/sources/ggml/cmake/BuildTypes.cmake +0 -54
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm-ppc.h +0 -333
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-exp.c +0 -94
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-inverse.c +0 -72
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +0 -49
- data/ext/sources/ggml/src/ggml-hexagon/htp/hvx-utils.c +0 -1020
- data/ext/sources/ggml/src/ggml-hexagon/htp/ops-utils.h +0 -149
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.c +0 -454
- data/ext/sources/ggml/src/ggml-hexagon/htp-utils.h +0 -221
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +0 -188
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +0 -45
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +0 -267
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.tmpl.wgsl +0 -112
- data/ext/sources/ggml/src/ggml-webgpu/wgsl-shaders/unary_op.wgsl +0 -483
data/ext/sources/examples/talk-llama/llama-model.cpp
@@ -1,5 +1,6 @@
 #include "llama-model.h"
 
+#include "ggml.h"
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-cparams.h"
@@ -8,6 +9,7 @@
 #include "llama-kv-cache.h"
 #include "llama-kv-cache-iswa.h"
 #include "llama-memory-hybrid.h"
+#include "llama-memory-hybrid-iswa.h"
 #include "llama-memory-recurrent.h"
 
 #include "ggml-cpp.h"
@@ -17,6 +19,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cfloat>
+#include <cstdint>
 #include <cstring>
 #include <cmath>
 #include <functional>
@@ -60,6 +63,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
+        case LLM_TYPE_0_8B: return "0.8B";
         case LLM_TYPE_1B: return "1B";
         case LLM_TYPE_1_2B: return "1.2B";
         case LLM_TYPE_1_3B: return "1.3B";
@@ -122,17 +126,25 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_8B_A1B: return "8B.A1B";
         case LLM_TYPE_16B_A1B: return "16B.A1B";
         case LLM_TYPE_21B_A3B: return "21B.A3B";
+        case LLM_TYPE_24B_A2B: return "24B.A2B";
         case LLM_TYPE_30B_A3B: return "30B.A3B";
         case LLM_TYPE_31B_A3_5B: return "31B.A3.5B";
+        case LLM_TYPE_35B_A3B: return "35B.A3B";
+        case LLM_TYPE_48B_A3B: return "48B.A3B";
         case LLM_TYPE_80B_A3B: return "80B.A3B";
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
+        case LLM_TYPE_120B_A12B: return "120B.A12B";
+        case LLM_TYPE_122B_A10B: return "122B.A10B";
+        case LLM_TYPE_196B_A11B: return "196B.A11B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
         case LLM_TYPE_310B_A15B: return "310B.A15B";
         case LLM_TYPE_355B_A32B: return "355B.A32B";
+        case LLM_TYPE_397B_A17B: return "397B.A17B";
+        case LLM_TYPE_744B_A40B: return "744B.A40B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
         default: return "?B";
@@ -168,160 +180,6 @@ static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::st
     return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }
 
-// checks if the weight tensor can be used with the specified buffer type and device
-static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
-    GGML_ASSERT(w != nullptr);
-
-    if (op == GGML_OP_NONE) {
-        return true;
-    }
-
-    ggml_init_params params = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*8,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    ggml_context_ptr ctx_ptr { ggml_init(params) };
-    if (!ctx_ptr) {
-        throw std::runtime_error(format("failed to create ggml context"));
-    }
-    ggml_context * ctx = ctx_ptr.get();
-
-    ggml_tensor * op_tensor = nullptr;
-
-    switch (op) {
-        case GGML_OP_GET_ROWS:
-            {
-                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
-                op_tensor = ggml_get_rows(ctx, w, b);
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
-                op_tensor = ggml_mul_mat(ctx, w, b);
-            } break;
-        case GGML_OP_MUL_MAT_ID:
-            {
-                int n_expert_used = hparams.n_expert_used;
-                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
-                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
-                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
-            } break;
-        case GGML_OP_ADD:
-            {
-                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
-                op_tensor = ggml_add(ctx, a, w);
-            } break;
-        case GGML_OP_ADD_ID:
-            {
-                int n_expert_used = hparams.n_expert_used;
-                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
-                ggml_tensor * c = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
-                op_tensor = ggml_add_id(ctx, a, w, c);
-            } break;
-        case GGML_OP_MUL:
-            {
-                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
-                op_tensor = ggml_mul(ctx, a, w);
-            } break;
-        case GGML_OP_DIV:
-            {
-                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
-                op_tensor = ggml_div(ctx, a, w);
-            } break;
-        case GGML_OP_ROPE:
-            {
-                int n_embd_head = hparams.n_embd_head_v;
-                int n_head = hparams.n_head();
-                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
-                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
-                op_tensor = ggml_rope_ext(
-                    ctx, a, b, w,
-                    0, 0, 0, 0, 0,
-                    0, 0, 0, 0
-                );
-
-            } break;
-        case GGML_OP_SSM_CONV:
-            {
-                const int64_t n_seq_tokens = 512;
-                const int64_t n_seqs = 3;
-                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0] - 1 + n_seq_tokens, w->ne[1], n_seqs);
-                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
-            } break;
-        case GGML_OP_SSM_SCAN:
-            {
-                // w is ssm_a, which is used to distinguish Mamba-1 and Mamba-2
-                const int64_t d_state = w->ne[0] == 1 ? hparams.ssm_d_state : w->ne[0];
-                const int64_t n_head = w->ne[1];
-                const int64_t head_dim = hparams.ssm_d_inner / n_head;
-                const int64_t n_group = hparams.ssm_n_group ? hparams.ssm_n_group : 1;
-                const int64_t n_seq_tokens = 512;
-                const int64_t n_seqs = 3;
-                ggml_tensor * s = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, head_dim, n_head, n_seqs);
-                ggml_tensor * x = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_dim, n_head, n_seq_tokens, n_seqs);
-                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_head, n_seq_tokens, n_seqs);
-                ggml_tensor * B = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
-                ggml_tensor * C = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, d_state, n_group, n_seq_tokens, n_seqs);
-                ggml_tensor * ids = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);
-                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C, ids);
-            } break;
-        case GGML_OP_RWKV_WKV6:
-            {
-                // FIXME
-                const int64_t S = 123;
-                const int64_t H = 123;
-                const int64_t n_tokens = 123;
-                const int64_t n_seqs = 123;
-                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
-                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
-                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
-                ggml_tensor * tf = w;
-                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
-                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
-                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
-            } break;
-        case GGML_OP_IM2COL:
-            {
-                const int n_embd_inp = hparams.n_embd_inp();
-                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1);
-                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
-            } break;
-        case GGML_OP_SCALE:
-            {
-                op_tensor = ggml_scale(ctx, w, 1.0f);
-            } break;
-        default:
-            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
-    }
-
-    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
-    GGML_ASSERT(w->buffer == nullptr);
-    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
-    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
-    ggml_backend_buffer_free(w->buffer);
-    w->buffer = nullptr;
-
-    return op_supported;
-}
-
-// lists of buffer types used for each layer
-using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
-
-// find the first buffer type in the list that can use the tensor
-static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
-    GGML_ASSERT(!buft_list.empty());
-    for (const auto & cur : buft_list) {
-        ggml_backend_dev_t cur_dev = cur.first;
-        ggml_backend_buffer_type_t cur_buft = cur.second;
-        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
-            return cur_buft;
-        }
-    }
-
-    return nullptr;
-}
-
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
 static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts, bool no_host) {
     buft_list_t buft_list;
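The block removed above is the weight-placement machinery: `weight_buft_supported()` built a representative dummy op for each weight, and `select_weight_buft()` walked a `(device, buffer type)` list until one accepted it. Since `buft_list_t` and `make_cpu_buft_list` still appear in the surviving context, the helpers were presumably relocated in the upstream llama.cpp sync rather than dropped. For reference, here is a condensed, self-contained sketch of the probing pattern (an illustration under that assumption, not the relocated implementation; only ggml calls that appear verbatim in the removed code are used):

```cpp
// Condensed sketch of the probing pattern from the removed helpers: create a
// throwaway no-alloc op node, temporarily attach a zero-sized buffer of the
// candidate type to the weight, and ask the device whether it supports the op.
#include "ggml.h"
#include "ggml-backend.h"

static bool probe_weight_mul_mat(ggml_tensor * w, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor metadata only, no data allocation
    };
    ggml_context * ctx = ggml_init(params);
    if (!ctx) {
        return false;
    }

    // dummy activations with an arbitrary batch size, as in the removed code
    ggml_tensor * b  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
    ggml_tensor * op = ggml_mul_mat(ctx, w, b);

    // temporary dummy buffer so that supports_op can inspect the buffer type
    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
    const bool supported = ggml_backend_dev_supports_op(dev, op);
    ggml_backend_buffer_free(w->buffer);
    w->buffer = nullptr;

    ggml_free(ctx);
    return supported;
}
```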
@@ -446,7 +304,7 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored as well
+    // contexts where the model tensors metadata is stored as well as the corresponding buffers:
     std::vector<std::pair<ggml_context_ptr, std::vector<ggml_backend_buffer_ptr>>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
@@ -468,7 +326,11 @@ llama_model::llama_model(const llama_model_params & params) : params(params), pi
     pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
 }
 
-llama_model::~llama_model()
+llama_model::~llama_model() {
+    for (auto * lora : loras) {
+        delete lora;
+    }
+}
 
 void llama_model::load_stats(llama_model_loader & ml) {
     pimpl->n_elements = ml.n_elements;
@@ -483,7 +345,7 @@ void llama_model::load_arch(llama_model_loader & ml) {
 }
 
 void llama_model::load_hparams(llama_model_loader & ml) {
-    const gguf_context * ctx = ml.
+    const gguf_context * ctx = ml.metadata;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
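The surviving context shows `load_hparams` iterating every GGUF metadata pair via `gguf_get_n_kv`. As a standalone illustration of that part of ggml's gguf API (the `gguf_init_params` layout follows ggml's `gguf.h`; the model path is hypothetical), a minimal key dumper could look like:

```cpp
// Minimal sketch: print all metadata keys of a GGUF file, mirroring the
// gguf_get_n_kv() loop in load_hparams above. Assumes ggml's gguf API.
#include "gguf.h"
#include <cstdio>

int main() {
    gguf_init_params params = {
        /*.no_alloc =*/ true,    // read metadata only, skip tensor data
        /*.ctx      =*/ nullptr, // no ggml_context needed for key access
    };
    gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
    if (!ctx) {
        return 1;
    }
    for (int64_t i = 0; i < gguf_get_n_kv(ctx); i++) {
        printf("%s\n", gguf_get_key(ctx, i));
    }
    gguf_free(ctx);
    return 0;
}
```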
@@ -507,7 +369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -515,7 +377,8 @@
     ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT, hparams.n_group_used, false);
 
     if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
-        ml.get_key(LLM_KV_FEATURES_LENGTH,
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd);
+        ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd_out_impl);
 
         ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
         ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
@@ -554,6 +417,8 @@
     std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
     std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
     std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
+    std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f);
+    std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f);
 
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
@@ -595,26 +460,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  // gpt-j n_rot = rotary_dim

- hparams.
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.
+ hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);

- hparams.
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.
+ hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);

  // sanity check for n_rot (optional)
- hparams.
+ hparams.n_rot_full = hparams.n_embd_head_k_full;

- ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);

  if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
- if (hparams.
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.
+ if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
  }
  }
  } else {
- hparams.
- hparams.
- hparams.
+ hparams.n_rot_full = 0;
+ hparams.n_embd_head_k_full = 0;
+ hparams.n_embd_head_v_full = 0;
+ }
+
+ // head size and n_rot for SWA layers
+ {
+ hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+ hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
+
+ hparams.n_rot_swa = hparams.n_rot_full;
+ ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
  }

  // for differentiating model types
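Note: the new *_full/*_swa head-size fields follow a derive-then-override pattern: compute a default from n_embd and n_head, then let an optional GGUF key replace it, with SWA layers inheriting the full-attention value unless their own key is present. A sketch of the arithmetic under that reading (plain C++, no llama.cpp types):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_embd = 4096, n_head = 32;
        // default: head size derived from the embedding width
        uint32_t n_embd_head_k_full = n_embd / n_head;   // 128
        // an optional ATTENTION_KEY_LENGTH key would overwrite it here
        // SWA layers start from the full-attention value
        uint32_t n_embd_head_k_swa = n_embd_head_k_full;
        // an optional ATTENTION_KEY_LENGTH_SWA key would overwrite it here
        std::printf("full=%u swa=%u\n", n_embd_head_k_full, n_embd_head_k_swa);
        return 0;
    }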
@@ -674,7 +550,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.n_attn_temp_floor_scale = 8192;
  hparams.f_attn_temp_scale = 0.1f;
  hparams.f_attn_temp_offset = 1.0f;
-
+ uint32_t swa_period = 4; // pattern: 3 chunked - 1 full
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
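Note: the hard-coded sliding-window pattern becomes a default that a SLIDING_WINDOW_PATTERN key can override, and set_swa_pattern(swa_period) expands the period into per-layer flags. Assuming the convention the in-diff comment describes ("3 chunked - 1 full" for period 4, i.e. every swa_period-th layer is full attention), a sketch:

    #include <cstdio>
    #include <vector>

    int main() {
        const unsigned n_layer = 8, swa_period = 4;
        std::vector<bool> swa_layers(n_layer);
        for (unsigned il = 0; il < n_layer; ++il) {
            // 0-based layers 3, 7, ... use full attention, the rest are SWA
            swa_layers[il] = (il + 1) % swa_period != 0;
        }
        for (unsigned il = 0; il < n_layer; ++il) {
            std::printf("layer %u: %s\n", il, swa_layers[il] ? "swa" : "full");
        }
        return 0;
    }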
@@ -711,7 +589,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_AFMOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
@@ -723,7 +601,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
  if (hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -868,7 +748,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  switch (hparams.n_layer) {
@@ -891,18 +771,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 3;
  hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
-
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 3;
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
- hparams.set_swa_pattern(swa_period);
+ hparams.set_swa_pattern(swa_period, true);
  } else {
  hparams.swa_type = LLAMA_SWA_TYPE_NONE;
  }

  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  switch (hparams.n_layer) {
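Note: the new second argument matches llama.cpp's set_swa_pattern(n_pattern, dense_first) overload. The sketch below assumes dense_first = true places the full-attention (dense) layer at the start of each group rather than the end; that interpretation is an assumption of this note, not something the diff itself states:

    #include <cstdio>

    int main() {
        const unsigned n_layer = 6, period = 3;
        for (unsigned il = 0; il < n_layer; ++il) {
            const bool swa_default     = (il + 1) % period != 0; // full layer last in group
            const bool swa_dense_first = il % period != 0;       // full layer first in group
            std::printf("layer %u: default=%s dense_first=%s\n", il,
                        swa_default ? "swa" : "full",
                        swa_dense_first ? "swa" : "full");
        }
        return 0;
    }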
@@ -918,7 +797,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_JINA_BERT_V2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  hparams.f_max_alibi_bias = 8.0f;

@@ -931,7 +810,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_JINA_BERT_V3:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
  ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  switch (hparams.n_layer) {
@@ -944,8 +823,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_NOMIC_BERT_MOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);

  if (hparams.n_layer == 12 && hparams.n_embd == 768) {
@@ -959,13 +838,23 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_NEO_BERT:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  if (hparams.n_layer == 28) {
  type = LLM_TYPE_250M;
  }
  } break;
+ case LLM_ARCH_EUROBERT:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+ if (hparams.n_layer == 12) {
+ type = LLM_TYPE_SMALL; // 0.2B
+ }
+ } break;
  case LLM_ARCH_BLOOM:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -988,7 +877,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);

  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_7B; break;
@@ -1237,19 +1126,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  break;
  default: type = LLM_TYPE_UNKNOWN;
  }
-
- // Load attention parameters
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  } break;
  case LLM_ARCH_PLAMO3:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  if (found_swa && hparams.n_swa > 0) {
- uint32_t swa_period = 8;
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ uint32_t swa_period = 8;
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
  hparams.set_swa_pattern(swa_period);
  } else {
@@ -1312,7 +1197,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096; // default value of gemma 2
-
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
  hparams.attn_soft_cap = true;
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -1333,14 +1220,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
  hparams.f_attention_scale = type == LLM_TYPE_27B
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
  } break;
  case LLM_ARCH_GEMMA3:
  {
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  if (found_swa && hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  } else {
@@ -1364,12 +1253,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
  hparams.f_attention_scale = type == LLM_TYPE_27B
  ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
- : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
  } break;
  case LLM_ARCH_GEMMA3N:
  {
+ uint32_t swa_period = 5;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
- hparams.set_swa_pattern(
+ hparams.set_swa_pattern(swa_period);

  hparams.n_layer_kv_from_start = 20;
  hparams.f_attention_scale = 1.0f;
@@ -1387,14 +1278,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_GEMMA_EMBEDDING:
  {
  hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-
+ uint32_t swa_period = 6;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.causal_attn = false; // embeddings do not use causal attention

  ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+ ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

  //applied only if model converted with --sentence-transformers-dense-modules
  ml.get_key(LLM_KV_DENSE_2_FEAT_IN, hparams.dense_2_feat_in, false);
@@ -1409,7 +1302,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case 24: type = LLM_TYPE_0_3B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
- hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+ hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));

  } break;
  case LLM_ARCH_STARCODER2:
@@ -1501,7 +1394,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  }

  switch (hparams.n_layer) {
- // TODO: Jamba layers are a bit
+ // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
  case 12: // 900M 8x???M
  case 32: // 51B 16x?B
  default: type = LLM_TYPE_UNKNOWN;
@@ -1519,7 +1412,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_COMMAND_R:
  {
- ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+ ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  switch (hparams.n_layer) {
  case 40: type = LLM_TYPE_35B; break;
@@ -1529,7 +1422,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_COHERE2:
  {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

@@ -1571,7 +1466,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
  if (found_swa && hparams.n_swa > 0) {
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp
@@ -1678,10 +1575,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_DEEPSEEK:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);

  switch (hparams.n_ff_exp) {
  case 1408: type = LLM_TYPE_16B; break;
@@ -1691,16 +1588,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_DEEPSEEK2:
  {
- // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
- bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+ // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B
+ const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256));
+
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  if (!is_lite) {
  ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  }
  ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
- ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.
- ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
@@ -1709,7 +1607,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
  // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
  // that have no expert_gating_func model parameter set
- hparams.
+ if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+ // GLM 4.7 Lite
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ } else {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+ }
  }

  if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
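Note: when an older GGUF carries no expert_gating_func key, the loader now disambiguates by shape fingerprint: GLM 4.7 Lite files reuse the DEEPSEEK2 arch but have a distinctive (n_layer, n_vocab) pair, so they get sigmoid gating while genuine DeepSeek V2/V2.5 files keep softmax. A standalone sketch of that fallback:

    #include <cstdint>
    #include <cstdio>

    enum gating_func { GATING_NONE, GATING_SOFTMAX, GATING_SIGMOID };

    // mirrors the fallback: only consulted when the GGUF carried no gating key
    gating_func resolve_gating(uint32_t n_layer, uint32_t n_vocab, gating_func from_gguf) {
        if (from_gguf != GATING_NONE) {
            return from_gguf;
        }
        if ((n_layer == 47 || n_layer == 48) && n_vocab == 154880) {
            return GATING_SIGMOID; // GLM 4.7 Lite fingerprint
        }
        return GATING_SOFTMAX;     // legacy DeepSeek V2 / V2.5
    }

    int main() {
        std::printf("%d\n", resolve_gating(47, 154880, GATING_NONE)); // sigmoid
        std::printf("%d\n", resolve_gating(60, 102400, GATING_NONE)); // softmax
        return 0;
    }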
@@ -1726,6 +1629,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  switch (hparams.n_layer) {
  case 27: type = LLM_TYPE_16B; break;
+ case 47: type = LLM_TYPE_30B_A3B; break;
  case 60: type = LLM_TYPE_236B; break;
  case 61: type = LLM_TYPE_671B; break;
  default: type = LLM_TYPE_UNKNOWN;
@@ -1765,7 +1669,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // NextN/MTP parameters (GLM-OCR)
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ // TODO: when MTP is implemented, this should probably be updated if needed
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
  switch (hparams.n_layer) {
+ case 17: type = LLM_TYPE_1B; break; // GLM-OCR
  case 40: type = LLM_TYPE_9B; break;
  case 61: type = LLM_TYPE_32B; break;
  default: type = LLM_TYPE_UNKNOWN;
@@ -1782,7 +1694,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
  ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);

  // Expert gating function (GLM-4.5 uses sigmoid)
@@ -1804,6 +1716,50 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_GLM_DSA:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
+ // MoE parameters
+ ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+ ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // deepseek MLA parameters
+ ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+
+ // DSA parameters
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head);
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size);
+ ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k);
+
+ // Expert gating function (GLM-4.5 uses sigmoid)
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ // NextN/MTP parameters
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ // TODO: when MTP is implemented, this should probably be updated if needed
+ hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
+ switch (hparams.n_layer) {
+ case 79: type = LLM_TYPE_744B_A40B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_BITNET:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1857,7 +1813,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_JAIS:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
- ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+ ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false);

  switch (hparams.n_layer) {
  case 24: type = LLM_TYPE_1_3B; break;
@@ -1866,6 +1822,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_JAIS2:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_8B; break;
+ case 68: type = LLM_TYPE_70B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_NEMOTRON:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1896,10 +1862,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false);

  switch (hparams.n_layer) {
  case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B
  case 56: type = LLM_TYPE_9B; break;
+ case 88: type = LLM_TYPE_120B_A12B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
@@ -1917,7 +1885,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  if (hparams.n_layer == 64) { // 32B
  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
  hparams.n_swa = 4096;
-
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -1933,6 +1903,36 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_EXAONE_MOE:
+ {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 128;
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);
+ hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
+ hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false);
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+
+ ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+
+ switch (hparams.n_layer) {
+ case 32: type = LLM_TYPE_30B_A3B; break;
+ case 48:
+ case 49: type = LLM_TYPE_235B_A22B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_RWKV6:
  case LLM_ARCH_RWKV6QWEN2:
  {
@@ -2006,9 +2006,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
- ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
- ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
- ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
+ ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale, false);
+ ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale, false);
+ ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale, false);

  // Granite uses rope_finetuned as a switch for rope, so default to true
  bool rope_finetuned = true;
@@ -2066,7 +2066,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
- ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
+ ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false);

  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_7B; break;
@@ -2079,15 +2079,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
  ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
- ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+ ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false);
  } break;
  case LLM_ARCH_BAILINGMOE:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);

  switch (hparams.n_layer) {
@@ -2099,11 +2099,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_BAILINGMOE2:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
  ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
@@ -2122,10 +2122,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_DOTS1:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
- ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
  ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
  switch (hparams.n_layer) {
@@ -2135,13 +2135,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  } break;
  case LLM_ARCH_ERNIE4_5:
  case LLM_ARCH_ERNIE4_5_MOE:
+ case LLM_ARCH_PADDLEOCR:
  {
+ // paddleocr need mrope_section
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
+
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  if (arch == LLM_ARCH_ERNIE4_5_MOE) {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
  ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  }

  switch (hparams.n_layer) {
@@ -2186,7 +2190,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);

  switch (hparams.n_layer) {
  case 32: type = LLM_TYPE_A13B; break;
@@ -2222,7 +2226,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);

  hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-
+ uint32_t swa_period = 2;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -2249,12 +2255,18 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case 10752: type = LLM_TYPE_2_6B; break;
  default: type = LLM_TYPE_UNKNOWN;
  }
+ if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) {
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+ hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il];
+ }
+ }
  } break;
  case LLM_ARCH_LFM2MOE:
  {
  ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
- ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);

@@ -2262,16 +2274,22 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
  }

-
+ switch (hparams.n_layer) {
+ case 24: type = LLM_TYPE_8B_A1B; break;
+ case 40: type = LLM_TYPE_24B_A2B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
  } break;
  case LLM_ARCH_SMALLTHINKER:
  {
  const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);

  if (found_swa && hparams.n_swa > 0) {
- hparams.swa_type
- hparams.n_swa
-
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+ hparams.n_swa = 4096;
+ uint32_t swa_period = 4;
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
+ hparams.set_swa_pattern(swa_period, true);

  hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
  hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
@@ -2294,7 +2312,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  case LLM_ARCH_GROVEMOE:
  {
  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
- ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp);
+ ml.get_key(LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, hparams.n_ff_chexp, false);
  ml.get_key(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale);
  ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts);
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2359,8 +2377,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);

  // Mark recurrent layers (linear attention layers)
-
-
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
  }

  switch (hparams.n_layer) {
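Note: the new FULL_ATTENTION_INTERVAL key replaces a hard-coded layer layout. With the default interval of 4, 0-based layers 3, 7, 11, ... keep full attention and every other layer is marked recurrent (linear attention). A sketch of the resulting mask:

    #include <cstdio>

    int main() {
        const unsigned n_layer = 12, full_attn_interval = 4;
        for (unsigned i = 0; i < n_layer; ++i) {
            const bool recurrent = (i + 1) % full_attn_interval != 0;
            std::printf("layer %2u: %s\n", i, recurrent ? "recurrent" : "full attention");
        }
        return 0;
    }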
@@ -2368,6 +2390,65 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_QWEN35:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+
+ // Load linear attention (gated delta net) parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Mark recurrent layers (linear attention layers)
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+ case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
+ case 64: type = LLM_TYPE_27B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_QWEN35MOE:
+ {
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+
+ // Load linear attention (gated delta net) parameters
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+ ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+ ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+ ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group);
+
+ // Mark recurrent layers (linear attention layers)
+ {
+ uint32_t full_attn_interval = 4;
+ ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false);
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0);
+ }
+ }
+
+ switch (hparams.n_layer) {
+ case 40: type = LLM_TYPE_35B_A3B; break;
+ case 48: type = LLM_TYPE_122B_A10B; break;
+ case 60: type = LLM_TYPE_397B_A17B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
  case LLM_ARCH_MISTRAL3:
  {
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -2402,7 +2483,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

  ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
  ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
- ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
  ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);

  switch (hparams.n_layer) {
@@ -2410,7 +2491,69 @@ void llama_model::load_hparams(llama_model_loader & ml) {
  default: type = LLM_TYPE_UNKNOWN;
  }
  } break;
-
+ case LLM_ARCH_KIMI_LINEAR:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+ ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl);
+ ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl);
+ ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+ ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+ ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda);
+
+ // MLA qk_rope_head_dim (for reference)
+ // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192
+
+ // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba)
+ // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention)
+ for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+ hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent
+ }
+
+ // MoE parameters - Kimi uses moe_intermediate_size = 1024
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+ ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func);
+
+ switch (hparams.n_layer) {
+ case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ case LLM_ARCH_STEP35:
+ {
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+ hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+ // full_attention layer only use half of the RoPE dimensions
+ hparams.n_rot_full = hparams.n_rot_full / 2;
+
+ // MoE + SWA parameters
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+ ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+ ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+ // Step35 uses sigmoid gating by default (if not set in GGUF)
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+ }
+
+ ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+ ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+ ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false);
+ ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
+
+ switch (hparams.n_layer) {
+ case 45: type = LLM_TYPE_196B_A11B; break;
+ default: type = LLM_TYPE_UNKNOWN;
+ }
+ } break;
+ default: throw std::runtime_error("unsupported model architecture: " + arch_name());
  }

  pimpl->n_bytes = ml.n_bytes;
@@ -2508,224 +2651,63 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2508
2651
|
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
|
2509
2652
|
pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
|
|
2510
2653
|
|
|
2511
|
-
// assign the repeating layers to the devices according to the splits
|
|
2512
|
-
pimpl->dev_layer.resize(n_layer);
|
|
2513
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
2514
|
-
pimpl->dev_layer[il] = get_layer_buft_list(il);
|
|
2515
|
-
}
|
|
2516
|
-
|
|
2517
|
-
// assign the output layer
|
|
2518
|
-
pimpl->dev_output = get_layer_buft_list(n_layer);
|
|
2519
|
-
|
|
2520
|
-
// one ggml context per buffer type
|
|
2521
|
-
int max_n_tensors = ml.n_tensors;
|
|
2522
|
-
max_n_tensors += 1; // duplicated output tensor
|
|
2523
|
-
max_n_tensors += n_layer*2; // duplicated rope freq tensors
|
|
2524
|
-
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
|
2525
|
-
|
|
2526
|
-
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
|
|
2527
|
-
struct ggml_backend_buft_comparator {
|
|
2528
|
-
bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
|
|
2529
|
-
return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
|
|
2530
|
-
}
|
|
2531
|
-
};
|
|
2532
|
-
std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
|
|
2533
|
-
|
|
2534
|
-
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
|
2535
|
-
auto it = ctx_map.find(buft);
|
|
2536
|
-
if (it == ctx_map.end()) {
|
|
2537
|
-
ggml_init_params params = {
|
|
2538
|
-
/*.mem_size =*/ ctx_size,
|
|
2539
|
-
/*.mem_buffer =*/ NULL,
|
|
2540
|
-
/*.no_alloc =*/ true,
|
|
2541
|
-
};
|
|
2542
|
-
|
|
2543
|
-
ggml_context * ctx = ggml_init(params);
|
|
2544
|
-
if (!ctx) {
|
|
2545
|
-
throw std::runtime_error(format("failed to create ggml context"));
|
|
2546
|
-
}
|
|
2547
|
-
|
|
2548
|
-
ctx_map.emplace(buft, ctx);
|
|
2549
|
-
|
|
2550
|
-
return ctx;
|
|
2551
|
-
}
|
|
2552
|
-
return it->second.get();
|
|
2553
|
-
};
|
|
2554
|
-
|
|
2555
|
-
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
|
2556
|
-
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
|
2557
|
-
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
|
|
2558
|
-
|
|
2559
|
-
// create tensors for the weights
|
|
2560
|
-
{
|
|
2561
|
-
// note: cast to int64_t since we will use these for the tensor dimensions
|
|
2562
|
-
const int64_t n_head = hparams.n_head();
|
|
2563
|
-
const int64_t n_head_kv = hparams.n_head_kv();
|
|
2564
|
-
const int64_t n_embd = hparams.n_embd;
|
|
2565
|
-
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
2566
|
-
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
2567
|
-
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
2568
|
-
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
|
2569
|
-
const int64_t n_ff = hparams.n_ff();
|
|
2570
|
-
const int64_t n_embd_gqa = n_embd_v_gqa;
|
|
2571
|
-
const int64_t n_vocab = vocab.n_tokens();
|
|
2572
|
-
const int64_t n_token_types = vocab.n_token_types();
|
|
2573
|
-
const int64_t n_rot = hparams.n_rot;
|
|
2574
|
-
const int64_t n_expert = hparams.n_expert;
|
|
2575
|
-
const int64_t n_expert_used = hparams.n_expert_used;
|
|
2576
|
-
const int64_t n_ctx_train = hparams.n_ctx_train;
|
|
2577
|
-
|
|
2578
|
-
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
|
2579
|
-
throw std::runtime_error("model has expert layers but no expert layers are used");
|
|
2580
|
-
}
|
|
2581
|
-
|
|
2582
|
-
int n_moved_tensors = 0;
|
|
2583
|
-
ggml_tensor * first_moved_tensor = nullptr;
|
|
2584
|
-
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
|
2585
|
-
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
|
2586
|
-
|
|
2587
|
-
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
|
2588
|
-
ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
|
|
2589
|
-
|
|
2590
|
-
if (!t_meta) {
|
|
2591
|
-
if (flags & TENSOR_NOT_REQUIRED) {
|
|
2592
|
-
return nullptr;
|
|
2593
|
-
}
|
|
2594
|
-
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
|
|
2595
|
-
}
|
|
2596
|
-
|
|
2597
|
-
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
|
|
2598
|
-
// the tensor is duplicated
|
|
2599
|
-
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
|
|
2600
|
-
llm_tensor tn_tensor = tn.tensor;
|
|
2601
|
-
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
|
|
2602
|
-
tn_tensor = LLM_TENSOR_OUTPUT;
|
|
2603
|
-
}
|
|
2604
|
-
|
|
2605
|
-
llm_tensor_info info;
|
|
2606
|
-
try {
|
|
2607
|
-
info = llm_tensor_info_for(tn_tensor);
|
|
2608
|
-
} catch (const std::out_of_range & e) {
|
|
2609
|
-
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
|
|
2610
|
-
}
|
|
2611
|
-
|
|
2612
|
-
// skip unused tensors
|
|
2613
|
-
if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
|
|
2614
|
-
const size_t nbytes = ggml_nbytes(t_meta);
|
|
2615
|
-
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
|
|
2616
|
-
|
|
2617
|
-
ml.size_data -= nbytes;
|
|
2618
|
-
ml.n_created++;
|
|
2619
|
-
|
|
2620
|
-
return nullptr;
|
|
2621
|
-
}
|
|
2622
|
-
|
|
2623
|
-
// tensors with "bias" suffix are always used with GGML_OP_ADD or GGML_OP_ADD_ID
|
|
2624
|
-
ggml_op op;
|
|
2625
|
-
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
|
2626
|
-
if (bias) {
|
|
2627
|
-
if (info.op == GGML_OP_MUL_MAT_ID) {
|
|
2628
|
-
op = GGML_OP_ADD_ID;
|
|
2629
|
-
} else {
|
|
2630
|
-
op = GGML_OP_ADD;
|
|
2631
|
-
}
|
|
2632
|
-
} else {
|
|
2633
|
-
op = info.op;
|
|
2634
|
-
}
|
|
2635
|
-
|
|
2636
|
-
// sanity checks
|
|
2637
|
-
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
|
|
2638
|
-
if (tn.bid != -1) {
|
|
2639
|
-
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
|
|
2640
|
-
}
|
|
2641
|
-
} else {
|
|
2642
|
-
if (tn.bid == -1) {
|
|
2643
|
-
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
|
|
2644
|
-
}
|
|
2645
|
-
}
|
|
2646
|
-
|
|
2647
|
-
// select the buffer type for this tensor
|
|
2648
|
-
buft_list_t * buft_list;
|
|
2649
|
-
switch (info.layer) {
|
|
2650
|
-
case LLM_TENSOR_LAYER_INPUT:
|
|
2651
|
-
buft_list = pimpl->dev_input.buft_list;
|
|
2652
|
-
break;
|
|
2653
|
-
case LLM_TENSOR_LAYER_OUTPUT:
|
|
2654
|
-
buft_list = pimpl->dev_output.buft_list;
|
|
2655
|
-
break;
|
|
2656
|
-
case LLM_TENSOR_LAYER_REPEATING:
|
|
2657
|
-
buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
|
|
2658
|
-
break;
|
|
2659
|
-
default:
|
|
2660
|
-
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
2661
|
-
}
|
|
2662
|
-
|
|
2663
|
-
ggml_backend_buffer_type_t buft = nullptr;
|
|
2664
|
-
|
|
2665
|
-
// check overrides
|
|
2666
|
-
if (ml.tensor_buft_overrides) {
|
|
2667
|
-
std::string tensor_name = tn.str();
|
|
2668
|
-
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
2669
|
-
std::regex pattern(overrides->pattern);
|
|
2670
|
-
if (std::regex_search(tensor_name, pattern)) {
|
|
2671
|
-
if (overrides->buft == ggml_backend_cpu_buffer_type()) {
|
|
2672
|
-
// when overriding to a CPU buffer, consider the extra buffer types
|
|
2673
|
-
buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
|
|
2674
|
-
} else {
|
|
2675
|
-
buft = overrides->buft;
|
|
2676
|
-
}
|
|
2677
|
-
|
|
2678
|
-
LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
|
|
2679
|
-
tensor_name.c_str(),
|
|
2680
|
-
ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
|
|
2681
|
-
ggml_backend_buft_name(buft));
|
|
2682
|
-
break;
|
|
2683
|
-
}
|
|
2684
|
-
}
|
|
2685
|
-
}
|
|
2686
|
-
|
|
2687
|
-
if (!buft) {
|
|
2688
|
-
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
|
2689
|
-
if (!buft) {
|
|
2690
|
-
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
2691
|
-
}
|
|
2692
|
-
}
|
|
2654
|
+
// assign the repeating layers to the devices according to the splits
|
|
2655
|
+
pimpl->dev_layer.resize(n_layer);
|
|
2656
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
2657
|
+
pimpl->dev_layer[il] = get_layer_buft_list(il);
|
|
2658
|
+
}
|
|
2693
2659
|
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
|
2697
|
-
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
2698
|
-
if (!cpu_dev) {
|
|
2699
|
-
throw std::runtime_error("no CPU backend found");
|
|
2700
|
-
}
|
|
2701
|
-
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
|
2702
|
-
}
|
|
2660
|
+
// assign the output layer
|
|
2661
|
+
pimpl->dev_output = get_layer_buft_list(n_layer);
|
|
2703
2662
|
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
|
|
2708
|
-
first_moved_from_buft = buft_list->front().second;
|
|
2709
|
-
first_moved_to_buft = buft;
|
|
2710
|
-
}
|
|
2711
|
-
}
|
|
2663
|
+
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
|
2664
|
+
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
|
2665
|
+
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
|
|
2666
|
+
const auto TENSOR_SKIP_IF_VIRTUAL = llama_model_loader::TENSOR_SKIP_IF_VIRTUAL;
|
|
2712
2667
|
|
|
2713
|
-
|
|
2668
|
+
// create tensors for the weights
|
|
2669
|
+
{
|
|
2670
|
+
// note: cast to int64_t since we will use these for the tensor dimensions
|
|
2671
|
+
const int64_t n_head = hparams.n_head();
|
|
2672
|
+
const int64_t n_head_kv = hparams.n_head_kv();
|
|
2673
|
+
const int64_t n_embd = hparams.n_embd;
|
|
2674
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
2675
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
2676
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k();
|
|
2677
|
+
const int64_t n_embd_head_v = hparams.n_embd_head_v();
|
|
2678
|
+
const int64_t n_ff = hparams.n_ff();
|
|
2679
|
+
const int64_t n_embd_gqa = n_embd_v_gqa;
|
|
2680
|
+
const int64_t n_vocab = vocab.n_tokens();
|
|
2681
|
+
const int64_t n_token_types = vocab.n_token_types();
|
|
2682
|
+
const int64_t n_rot = hparams.n_rot();
|
|
2683
|
+
const int64_t n_expert = hparams.n_expert;
|
|
2684
|
+
const int64_t n_expert_used = hparams.n_expert_used;
|
|
2685
|
+
const int64_t n_ctx_train = hparams.n_ctx_train;
|
|
2714
2686
|
|
|
2715
|
-
|
|
2716
|
-
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2687
|
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
|
2688
|
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
|
2689
|
+
}
|
|
2690
|
+
|
|
2691
|
+
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
|
2692
|
+
const buft_list_t * buft_list_layer = tn.bid == -1 ? nullptr : pimpl->dev_layer.at(tn.bid).buft_list;
|
|
2693
|
+
return ml.create_tensor(
|
|
2694
|
+
hparams, &pimpl->cpu_buft_list, pimpl->dev_input.buft_list, pimpl->dev_output.buft_list, buft_list_layer,
|
|
2695
|
+
tn, ne, flags);
|
|
2723
2696
|
};
|
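A note on the override mechanism in the removed block above: tensor buffer-type overrides are resolved by running each user-supplied regex against the tensor name and taking the first hit. Below is a minimal standalone sketch of that matching idea only; `buft_override` and `select_buft` are hypothetical stand-ins, not the llama.cpp API, and the real code resolves to `ggml_backend_buffer_type_t` handles rather than strings.

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Hypothetical stand-in for a buffer-type override table: the first regex
// that matches the tensor name wins, mirroring the loop in the removed code.
struct buft_override {
    std::string pattern; // ECMAScript regex matched against the tensor name
    std::string buft;    // illustrative label instead of ggml_backend_buffer_type_t
};

static std::string select_buft(const std::string & tensor_name,
                               const std::vector<buft_override> & overrides,
                               const std::string & fallback) {
    for (const auto & ov : overrides) {
        if (std::regex_search(tensor_name, std::regex(ov.pattern))) {
            return ov.buft; // first match wins, like the `break` in the original
        }
    }
    return fallback;
}

int main() {
    const std::vector<buft_override> overrides = {
        { "ffn_.*_exps", "CPU"  }, // e.g. keep MoE expert tensors on host
        { "output",      "GPU0" },
    };
    std::printf("%s\n", select_buft("blk.3.ffn_down_exps.weight", overrides, "GPU0").c_str()); // CPU
    std::printf("%s\n", select_buft("blk.3.attn_q.weight",        overrides, "GPU0").c_str()); // GPU0
}
```

First-match-wins ordering is what makes the `break` in the original loop significant: more specific patterns must be listed before general ones.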
 
         layers.resize(n_layer);
 
         // TODO: move to a separate function
         const auto tn = LLM_TN(arch);
+
+        // helper: try merged gate_up_exps first, fall back to separate gate and up
+        auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) {
+            layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED);
+            if (layer.ffn_gate_up_exps == nullptr) {
+                layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+                layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", bid), {n_embd_, n_ff_, n_expert_}, flags);
+            }
+        };
         switch (arch) {
             case LLM_ARCH_LLAMA:
             case LLM_ARCH_REFACT:
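The new `create_tensor_gate_up_exps` helper prefers a single merged expert tensor of shape `{n_embd, n_ff * 2, n_expert}` and only falls back to separate `ffn_gate_exps`/`ffn_up_exps` tensors when the merged one is absent, so both old and new GGUF layouts load. The toy sketch below reads one concatenated gate/up matrix through two views; the gate-first ordering is an assumption for illustration only, and the real ggml layout has its own dimension order and strides.

```cpp
#include <cstdio>
#include <vector>

// Toy illustration (not the ggml layout): a merged gate_up matrix stores the
// gate rows first and the up rows second along the feed-forward dimension,
// so one tensor of 2*n_ff rows can serve both projections.
int main() {
    const int n_embd = 4, n_ff = 3;
    std::vector<float> gate_up(2 * n_ff * n_embd);
    for (int r = 0; r < 2 * n_ff; ++r)
        for (int c = 0; c < n_embd; ++c)
            gate_up[r * n_embd + c] = (float) r; // row index as dummy weight

    const float * gate = gate_up.data();                 // rows [0, n_ff)
    const float * up   = gate_up.data() + n_ff * n_embd; // rows [n_ff, 2*n_ff)
    std::printf("gate row 0 starts with %.0f, up row 0 starts with %.0f\n", gate[0], up[0]);
}
```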
@@ -2879,6 +2861,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_LLAMA4:
                 {
+                    if (n_expert == 0) {
+                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
+                    }
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
                     // output
@@ -2891,7 +2876,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
-                        bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
+                        const bool is_moe_layer = hparams.n_moe_layer_step > 0 && (i + 1) % hparams.n_moe_layer_step == 0;
 
                         auto & layer = layers[i];
 
@@ -2907,7 +2892,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
 
                         if (is_moe_layer) {
-
+                            const int64_t n_ff_exp = hparams.n_ff_exp;
 
                             layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
                             layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
@@ -2994,8 +2979,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_MINICPM3:
                 {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
 
                     const int64_t q_lora_rank  = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
@@ -3038,7 +3023,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_GROK:
                 {
                     if (n_expert == 0) {
-                        throw std::runtime_error("
+                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
                     }
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -3210,6 +3195,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             case LLM_ARCH_NOMIC_BERT_MOE:
             case LLM_ARCH_JINA_BERT_V3:
                 {
+                    if (n_token_types == 0) {
+                        throw std::runtime_error(arch_name() + " model needs to define token type count");
+                    }
                     tok_embd  = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, 0);
                     type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
 
@@ -3294,9 +3282,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
                     }
 
-
-
-
+                    cls_out   = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                    cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT,  "bias"),   {hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
+                    cls       = create_tensor(tn(LLM_TENSOR_CLS,      "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+                    cls_norm  = create_tensor(tn(LLM_TENSOR_CLS_NORM, "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
 
                 } break;
             case LLM_ARCH_NEO_BERT:
@@ -3325,6 +3314,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_EUROBERT:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_JINA_BERT_V2:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
@@ -3452,8 +3464,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias",   i), {n_ff}, TENSOR_NOT_REQUIRED);
 
-
-                        layer.
+                        // FIXME test-llama-archs crashes if q_norm is created
+                        layer.attn_q_norm   = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+                        layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
 
                         layer.attn_k_norm   = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
                         layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias",   i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -3839,8 +3852,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16));
 
                     // attention parameters
-                    const uint32_t qk_dim = hparams.n_embd_head_k;
-                    const uint32_t v_dim  = hparams.n_embd_head_v;
+                    const uint32_t qk_dim = hparams.n_embd_head_k();
+                    const uint32_t v_dim  = hparams.n_embd_head_v();
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -3900,8 +3913,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_PLAMO3:
                 {
-                    const int64_t head_dim_q = hparams.n_embd_head_k;
-                    const int64_t head_dim_v = hparams.n_embd_head_v;
+                    const int64_t head_dim_q = hparams.n_embd_head_k();
+                    const int64_t head_dim_v = hparams.n_embd_head_v();
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -4648,7 +4661,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_SEED_OSS:
                 {
-                    const uint32_t head_dim = hparams.n_embd_head_k;
+                    const uint32_t head_dim = hparams.n_embd_head_k();
                     const int64_t n_qo_dim = n_head    * head_dim;
                     const int64_t n_kv_dim = n_head_kv * head_dim;
 
@@ -4871,17 +4884,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_DEEPSEEK2:
                 {
-
-                    const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-
-                    const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+                    const bool is_mla = hparams.is_mla();
 
                     // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
-                    const int64_t n_embd_head_k_mla =
-                    const int64_t n_embd_head_v_mla =
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
 
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
                     const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+                    GGML_ASSERT(n_embd_head_qk_nope >= 1);
 
                     const int64_t q_lora_rank  = hparams.n_lora_q;
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
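The new assert guards against a degenerate split: the MLA K head size is the RoPE part plus the NoPE part, so `n_embd_head_qk_nope` must stay positive. As a worked example with DeepSeek-V2-like numbers (assumed here, not read from the diff), `n_embd_head_k_mla = 192` and `n_rot = 64` give a NoPE width of 128:

```cpp
#include <cassert>
#include <cstdio>

int main() {
    // Assumed DeepSeek-V2-style MLA head sizes, for illustration only.
    const long n_embd_head_k_mla   = 192; // qk_nope_head_dim + qk_rope_head_dim
    const long n_embd_head_qk_rope = 64;  // rotary part (hparams.n_rot())
    const long n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

    assert(n_embd_head_qk_nope >= 1); // mirrors the new GGML_ASSERT in the hunk
    std::printf("nope width: %ld\n", n_embd_head_qk_nope); // 128
}
```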
@@ -4903,13 +4914,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         auto & layer = layers[i];
 
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                        if (
+                        if (q_lora_rank > 0) {
                             layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
                         }
 
                         layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
 
-                        if (
+                        if (q_lora_rank > 0) {
                             layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                             layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                         } else {
@@ -4946,9 +4957,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
 
                         // MoE branch
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, 0);
-                        layer
+                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
 
                         // Shared expert branch
                         layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
@@ -4959,8 +4969,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_PLM:
                 {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
                     const int64_t kv_lora_rank = hparams.n_lora_kv;
 
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5000,23 +5010,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
 
                         layer.wq   = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
-                        layer.
+                        layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                         layer.wk   = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.
+                        layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                         layer.wv   = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
-                        layer.
+                        layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                         layer.wo   = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
-                        layer.
+                        layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
 
                         layer.ffn_norm     = create_tensor(tn(LLM_TENSOR_FFN_NORM,     "weight", i), {n_embd}, 0);
                         layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
 
                         layer.ffn_gate   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                        layer.
+                        layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                         layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-                        layer.
+                        layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                         layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
-                        layer.
+                        layer.ffn_up_s   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "scale",  i), {1}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
             case LLM_ARCH_T5:
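The hunk above completes the optional per-projection scale tensors of shape `{1}`: a single float stored next to a quantized weight matrix that rescales the whole projection. A toy sketch of the idea, with hypothetical names:

```cpp
#include <cstdio>
#include <vector>

// Hypothetical sketch: a {1}-shaped "scale" tensor is just one float that
// rescales an entire projection matrix after (de)quantization.
static void apply_scale(std::vector<float> & w, const float * scale /* may be null */) {
    if (!scale) return;           // the scale tensor is optional (TENSOR_NOT_REQUIRED)
    for (float & x : w) x *= *scale;
}

int main() {
    std::vector<float> wq = {1.0f, -1.0f, 0.0f, 1.0f}; // e.g. ternary weights
    const float wq_s = 0.25f;                          // the {1} scale tensor
    apply_scale(wq, &wq_s);
    std::printf("%.2f %.2f\n", wq[0], wq[1]); // 0.25 -0.25
}
```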
@@ -5074,7 +5084,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                         layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
                         // this tensor seems to be unused in HF transformers implementation
-                        layer.attn_rel_b_cross = create_tensor(
+                        layer.attn_rel_b_cross = create_tensor(
+                            tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
 
                         layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
                         layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
@@ -5152,6 +5163,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_JAIS2:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm   = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, 0);
+                    output        = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias",   i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+                        // attention biases - all have shape n_embd (output dimension of projections)
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd}, 0);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd}, 0);
+                        layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
+
+                        layer.ffn_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                        layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias",   i), {n_embd}, 0);
+
+                        // Jais-2 uses simple MLP (no gate) with biases
+                        layer.ffn_up     = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up_b   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "bias",   i), {n_ff}, 0);
+                        layer.ffn_down   = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                        layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias",   i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_CHATGLM:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5202,30 +5252,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
                         auto & layer = layers[i];
 
-                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd},
-                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
-                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
+                        layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+                        layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias",   i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
 
                         if (layer.wqkv == nullptr) {
-                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,
-                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,
-                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,
-                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,
-                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,
-                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
+                            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias",   i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+                            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias",   i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
+                            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias",   i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
                         }
 
-                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
 
-                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd},
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
 
-                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
-                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd},
-                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2},
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+                        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff * 2}, flags);
 
-                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd},
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.enorm   = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm   = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), { n_embd }, flags);
+
+                            // Optional tensors
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+                        }
                     }
                 } break;
             case LLM_ARCH_GLM4_MOE:
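This hunk (and the GLM_DSA and EXAONE_MOE cases that follow) computes a per-layer `flags` value so that the trailing `nextn_predict_layers` layers, whose NextN/MTP tensors are preserved in the file but not executed, are created with `TENSOR_SKIP`. A minimal sketch of the index arithmetic; the flag value is illustrative, not the real enum:

```cpp
#include <cstdint>
#include <cstdio>

enum : int { TENSOR_SKIP = 1 << 0 }; // illustrative flag value, not the real enum

int main() {
    const int      n_layer              = 8;
    const uint32_t nextn_predict_layers = 2; // last two layers are NextN/MTP layers

    for (int i = 0; i < n_layer; ++i) {
        int flags = 0;
        if (nextn_predict_layers > 0 && (uint32_t) i >= n_layer - nextn_predict_layers) {
            flags |= TENSOR_SKIP; // layers 6 and 7 here
        }
        std::printf("layer %d: flags=%d\n", i, flags);
    }
}
```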
@@ -5329,6 +5397,108 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
                     }
                     break;
+            case LLM_ARCH_GLM_DSA:
+                {
+                    const bool is_mla = hparams.is_mla();
+                    if (!is_mla) {
+                        throw std::runtime_error("GLM_DSA architecture requires MLA");
+                    }
+
+                    // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
+
+                    const int64_t q_lora_rank  = hparams.n_lora_q;
+                    const int64_t kv_lora_rank = hparams.n_lora_kv;
+
+                    const int64_t n_ff_exp        = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    // try to load output.weight, if not found, use token_embd (tied embeddings)
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later
+                            flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED;
+                        }
+
+                        auto & layer = layers[i];
+
+                        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, flags);
+                        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM,  "weight", i), {q_lora_rank}, flags);
+                        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, flags);
+
+                        layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, flags);
+                        layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, flags);
+
+                        layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, flags);
+
+                        // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
+                        layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, flags);
+                        layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, flags);
+
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, flags);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                        // DSA indexer
+                        layer.indexer_k_norm   = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "weight", i), {hparams.indexer_head_size}, flags);
+                        layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM,   "bias",   i), {hparams.indexer_head_size}, flags);
+                        layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, flags);
+                        layer.indexer_attn_k   = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K,   "weight", i), {n_embd, hparams.indexer_head_size}, flags);
+                        layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags);
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
+                        } else {
+                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp,   n_embd, n_expert}, flags);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {  n_embd, n_ff_exp, n_expert}, flags);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, flags);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags);
+                        }
+
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
+                            layer.nextn.enorm   = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), { n_embd }, flags);
+                            layer.nextn.hnorm   = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), { n_embd }, flags);
+
+                            // Optional tensors
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
+                        }
+                    }
+                } break;
             case LLM_ARCH_NEMOTRON:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5377,6 +5547,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     const int64_t n_ssm_head = hparams.ssm_dt_rank;
                     const int64_t n_group    = hparams.ssm_n_group;
                     const int64_t d_in_proj  = 2*d_inner + 2*n_group*d_state + n_ssm_head;
+                    const int64_t moe_n_embd = hparams.moe_latent_size > 0 ? hparams.moe_latent_size : n_embd;
 
                     // embeddings
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -5436,8 +5607,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert }, 0);
 
                         // MoE branch
-                        layer.
-                        layer.
+                        layer.ffn_latent_down = create_tensor(tn(LLM_TENSOR_FFN_LATENT_DOWN, "weight", i), {n_embd, moe_n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_latent_up   = create_tensor(tn(LLM_TENSOR_FFN_LATENT_UP,   "weight", i), {moe_n_embd, n_embd}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, moe_n_embd, n_expert}, 0);
+                        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {moe_n_embd, n_ff_exp, n_expert}, 0);
 
                         // Shared expert branch
                         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
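When `moe_latent_size` is set, the experts in this hunk operate in a reduced latent width `moe_n_embd`: the optional `ffn_latent_down`/`ffn_latent_up` projections map the hidden state into and out of that space, which shrinks the per-expert `ffn_up_exps`/`ffn_down_exps` matrices. A toy shape-only sketch of the round trip (plain row-major mat-vec; the values are illustrative):

```cpp
#include <cstdio>
#include <vector>

// y = M x for a row-major (rows x cols) matrix: toy helper for the sketch.
static std::vector<float> matvec(const std::vector<float> & m, int rows, int cols,
                                 const std::vector<float> & x) {
    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            y[r] += m[r * cols + c] * x[c];
    return y;
}

int main() {
    const int n_embd = 8, moe_n_embd = 4; // latent space is half the width
    std::vector<float> x(n_embd, 1.0f);
    std::vector<float> down(moe_n_embd * n_embd, 0.1f); // n_embd -> moe_n_embd
    std::vector<float> up(n_embd * moe_n_embd, 0.1f);   // moe_n_embd -> n_embd

    auto latent = matvec(down, moe_n_embd, n_embd, x);  // the experts would run here
    auto out    = matvec(up, n_embd, moe_n_embd, latent);
    std::printf("latent dim %zu -> output dim %zu\n", latent.size(), out.size());
}
```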
@@ -5504,16 +5678,94 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_v_gqa}, 0);
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
-                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_norm    = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM,    "weight", i), {n_embd_head_k}, 0);
+                        layer.attn_k_norm    = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM,    "weight", i), {n_embd_head_k}, 0);
+
+                        layer.ffn_gate      = create_tensor(tn(LLM_TENSOR_FFN_GATE,      "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down      = create_tensor(tn(LLM_TENSOR_FFN_DOWN,      "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up        = create_tensor(tn(LLM_TENSOR_FFN_UP,        "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
+            case LLM_ARCH_EXAONE_MOE:
+                {
+                    const int64_t n_ff_exp      = hparams.n_ff_exp;
+                    const int64_t n_expert      = hparams.n_expert;
+                    const int64_t n_expert_used = hparams.n_expert_used;
+                    const int64_t n_ff_shexp    = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
+                    const int64_t head_dim      = hparams.n_embd_head_k();
+                    const int64_t n_qo_dim      = n_head    * head_dim;
+                    const int64_t n_kv_dim      = n_head_kv * head_dim;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        int flags = 0;
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            // skip all tensors in the NextN layers
+                            flags |= TENSOR_SKIP;
+                        }
+
+                        auto & layer = layers[i];
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_qo_dim}, flags);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_kv_dim}, flags);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_kv_dim}, flags);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, flags);
+
+                        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0) | flags);
+
+                        layer.attn_norm   = create_tensor(tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, flags);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
+
+                        // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end
+                        if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers)) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, flags);
+                        } else {
+                            layer.ffn_gate_inp    = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,    "weight", i), {n_embd, n_expert}, flags);
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED | flags);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, flags);
 
-
-
-
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, flags);
+                        }
 
-
-
-
-
+                        // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
+                        if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
+                            layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags);
+                            layer.nextn.enorm   = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,   "weight", i), {n_embd}, flags);
+                            layer.nextn.hnorm   = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,   "weight", i), {n_embd}, flags);
+
+                            layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+                            layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, flags | TENSOR_NOT_REQUIRED);
+                        }
                     }
                 } break;
             case LLM_ARCH_RWKV6:
@@ -5806,9 +6058,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_WAVTOKENIZER_DEC:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd, n_vocab}, 0);
 
-                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.
+                    conv1d   = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd, hparams.posnet.n_embd}, 0);
                     conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"),   {1, hparams.posnet.n_embd}, 0);
 
                     // posnet
@@ -5904,8 +6156,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
                     }
 
-                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd,
-                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {
+                    output   = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, hparams.n_embd_out()}, 0);
+                    output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"),   {hparams.n_embd_out()}, 0);
                 } break;
             case LLM_ARCH_BAILINGMOE:
                 {
@@ -6161,6 +6413,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_ERNIE4_5:
             case LLM_ARCH_ERNIE4_5_MOE:
+            case LLM_ARCH_PADDLEOCR:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -6303,6 +6556,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
+                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
 
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
@@ -6321,9 +6575,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert}, 0);
                         layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert}, 0);
 
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd,
-                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd,
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
+                        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
                     }
                 } break;
             case LLM_ARCH_HUNYUAN_DENSE:
@@ -6481,7 +6735,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
 
                     // for LFM2-ColBert-350M
-                    dense_2_out_layers
+                    dense_2_out_layers   = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
+                    dense_2_out_layers_b = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "bias"),   {hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
                 } break;
             case LLM_ARCH_SMALLTHINKER:
                 {
@@ -6637,6 +6892,141 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
                     }
                 } break;
+            case LLM_ARCH_KIMI_LINEAR:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        // Check for KDA specific tensors to determine layer type or if it's a mixed model
+                        // Assuming KDA layer if KDA tensors are present
+
+                        // KDA uses head_dim = 128 (from linear_attn_config.head_dim)
+                        const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda;
+                        const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda;
+                        const int64_t ssm_d_conv        = hparams.ssm_d_conv;
+
+                        if (hparams.is_recurrent(i)) {
+                            // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1)
+                            // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner]
+                            layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+                            if (!layer.ssm_q_conv) {
+                                layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+                            }
+
+                            // KDA Layer - Conv1d weights may be 3D or 4D
+                            layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+                            if (!layer.ssm_k_conv) {
+                                layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0);
+                            }
+                            layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED);
+                            if (!layer.ssm_v_conv) {
+                                layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0);
+                            }
+
+                            // q, k, v projections
+                            // Python: q_proj, k_proj, v_proj
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0);
+
+                            // KDA specific projections
+                            // f_a_proj, f_b_proj
+                            layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim
+                            layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size
+
+                            // b_proj (beta mixing coefficient)
+                            layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0);
+
+                            // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED);
+                            if (!layer.ssm_a) {
+                                layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0);
+                            }
+
+                            // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096]
+                            layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0);
+
+                            // g_a_proj, g_b_proj (output gate)
+                            layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0);
+                            layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0);
+
+                            // o_norm (reusing SSM_NORM)
+                            layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated
+
+                            // o_proj
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0);
+
+                        } else {
+                            // MLA Layer - use MLA-specific head dimensions
+                            const int64_t q_lora_rank       = hparams.n_lora_q;
+                            const int64_t kv_lora_rank      = hparams.n_lora_kv;
+                            const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+                            const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();
+
+                            layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM,  "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED);
+                            layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
+
+                            if (layer.attn_q_a_norm) {
+                                layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
+                                layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
+                            } else {
+                                // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla]
+                                layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
+                            }
+
+                            // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
+                            // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
+                            const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim
+                            layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
+                            // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
+                            layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
+                                {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED | TENSOR_SKIP_IF_VIRTUAL);
+                            if (!layer.wkv_b) { // MLA KV cache enabled
+                                layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0);
+                                layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
+                            }
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
+                        }
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        // MoE intermediate size (different from dense FFN)
+                        const int64_t n_ff_exp = hparams.n_ff_exp;
+
+                        // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE
+                        // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            // Dense FFN layer - use normal n_ff
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+                            layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+                        } else {
+                            // MoE layer - use n_ff_exp (1024) instead of n_ff (9216)
+                            layer.ffn_gate_inp  = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert}, 0);
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared experts use moe_intermediate_size * num_shared_experts
+                            // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024
+                            // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd]
+                            const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1);
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED);
+                            layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED);
+
+                            layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);
+                        }
+                    }
+                } break;
             case LLM_ARCH_COGVLM:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
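For the MLA branch above (as in the DEEPSEEK2 and GLM_DSA cases earlier), newer GGUFs ship `wkv_b` pre-split into per-head `wk_b` and `wv_b` so the compressed KV cache can be used directly; legacy files keep one combined tensor of width `n_head * (qk_nope + v_dim)`. The sketch below shows the per-head column offsets such a split implies; the concrete dimensions and the k-before-v ordering are assumptions for illustration, not the conversion script's guaranteed layout.

```cpp
#include <cstdio>

int main() {
    // Illustrative MLA dimensions (these specific values are assumptions)
    const int kv_lora_rank = 512;
    const int n_head       = 16;
    const int qk_nope      = 128; // n_embd_head_k_mla - qk_rope_head_dim
    const int v_dim        = 128; // n_embd_head_v_mla

    // Legacy combined tensor: {kv_lora_rank, n_head * (qk_nope + v_dim)}
    const int per_head = qk_nope + v_dim;
    for (int h = 0; h < 2; ++h) { // show the first two heads
        const int k_off = h * per_head;           // columns for this head's wk_b
        const int v_off = h * per_head + qk_nope; // columns for this head's wv_b
        std::printf("head %d: wk_b cols [%d,%d), wv_b cols [%d,%d)\n",
                    h, k_off, k_off + qk_nope, v_off, v_off + v_dim);
    }
}
```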
@@ -6718,6 +7108,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_QWEN3NEXT:
                 {
+                    if (n_expert == 0) {
+                        throw std::runtime_error(arch_name() + " model cannot have zero experts");
+                    }
+
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
 
                     // output
@@ -6746,6 +7140,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
+                        const uint32_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : hparams.n_ff(i);
 
                         layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), { n_embd }, 0);
                         layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
@@ -6776,15 +7171,138 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         }
 
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
-                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
                         layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
-                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
+
+                        // Shared experts
+                        layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
+            case LLM_ARCH_QWEN35MOE:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads = hparams.ssm_n_group;
+                    const int64_t n_v_heads = hparams.ssm_dt_rank;
+                    const int64_t key_dim = head_k_dim * n_k_heads;
+                    const int64_t value_dim = head_v_dim * n_v_heads;
+                    const int64_t conv_dim = key_dim * 2 + value_dim;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+                            layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
+                            layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                        create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0);
 
                         // Shared experts
+                        const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
+
                         layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0);
-                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
-                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, hparams.n_ff_shexp }, 0);
-                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { hparams.n_ff_shexp, n_embd }, 0);
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, 0);
+                    }
+                } break;
+            case LLM_ARCH_QWEN35:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
+                    }
+
+                    // Calculate dimensions from hyperparameters
+                    const int64_t head_k_dim = hparams.ssm_d_state;
+                    const int64_t head_v_dim = hparams.ssm_d_state;
+                    const int64_t n_k_heads = hparams.ssm_n_group;
+                    const int64_t n_v_heads = hparams.ssm_dt_rank;
+                    const int64_t key_dim = head_k_dim * n_k_heads;
+                    const int64_t value_dim = head_v_dim * n_v_heads;
+                    const int64_t conv_dim = key_dim * 2 + value_dim;
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
+
+                        if (!hparams.is_recurrent(i)) {
+                            // Attention layers
+                            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head * 2 }, 0);
+                            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0);
+                            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0);
+                            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                            // Q/K normalization for attention layers
+                            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0);
+                            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0);
+                        } else {
+                            // Linear attention (gated delta net) specific tensors
+                            // Create tensors with calculated dimensions
+                            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, TENSOR_NOT_REQUIRED);
+                            layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, TENSOR_NOT_REQUIRED);
+                            layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
+                            layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+                            layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
+                            layer.ssm_alpha = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
+                            layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
+                            layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
+                        }
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
             case LLM_ARCH_MIMO2:
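The new QWEN35/QWEN35MOE cases derive every linear-attention tensor shape from four scalar hyperparameters. A short worked example of that arithmetic, with invented values standing in for the GGUF metadata:

```cpp
#include <cstdint>
#include <cstdio>

// Worked example of the dimension arithmetic in the QWEN35/QWEN35MOE hunks
// above. The numbers are illustrative; the real values come from the model's
// GGUF hyperparameters (ssm_d_state, ssm_n_group, ssm_dt_rank).
int main() {
    const int64_t head_k_dim = 128; // hparams.ssm_d_state
    const int64_t head_v_dim = 128; // hparams.ssm_d_state
    const int64_t n_k_heads  = 16;  // hparams.ssm_n_group
    const int64_t n_v_heads  = 32;  // hparams.ssm_dt_rank

    const int64_t key_dim   = head_k_dim * n_k_heads; // 2048
    const int64_t value_dim = head_v_dim * n_v_heads; // 4096

    // the short convolution spans Q, K (key_dim each) and V (value_dim)
    const int64_t conv_dim = key_dim * 2 + value_dim;  // 8192

    // wqkv projects n_embd -> Q + K + V in one fused matmul of the same width
    std::printf("wqkv out = %lld, conv width = %lld\n",
                (long long) (key_dim * 2 + value_dim), (long long) conv_dim);
}
```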
@@ -6825,6 +7343,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
                     }
                 } break;
+            case LLM_ARCH_STEP35:
+                {
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor
+                    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
+                    uint32_t n_rot_max = 0;
+                    for (int i = 0; i < n_layer; ++i) {
+                        n_rot_max = std::max(n_rot_max, hparams.n_rot(i));
+                    }
+                    if (n_rot_max == 0) {
+                        n_rot_max = n_rot;
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        const uint32_t n_head_l = hparams.n_head(i);
+                        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
+                        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+
+                        // optional rope factors (llama3) / longrope tensors
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        } else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        }
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
+
+                        // head-wise attention gate (Step35 self_attn.g_proj)
+                        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED);
+
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        // dense MLP (leading dense blocks)
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
+
+                        // MoE routed experts + selection bias (router_bias)
+                        const int64_t n_ff_exp = hparams.n_ff_exp;
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
+
+                        // shared expert MLP
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
+                    }
+                } break;
             case LLM_ARCH_MAINCODER:
                 {
                     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
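The rope-factor sizing above takes the widest per-layer rotation so one shared "rope_freqs.weight" tensor can serve every layer, falling back to the global n_rot when no per-layer value is set. A standalone sketch of that computation (the per-layer widths here are invented):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Sketch of the shared rope-factor sizing: the factors tensor is sized for
// the widest per-layer rotation, and each layer reads only its first
// n_rot(i)/2 entries.
int main() {
    const uint32_t n_rot_per_layer[] = { 64, 128, 96, 0 };
    const uint32_t n_rot_default = 128; // global fallback

    uint32_t n_rot_max = 0;
    for (uint32_t n : n_rot_per_layer) {
        n_rot_max = std::max(n_rot_max, n);
    }
    if (n_rot_max == 0) {
        n_rot_max = n_rot_default;
    }

    // one shared tensor of n_rot_max/2 factors covers every layer
    std::printf("rope_freqs.weight has %u entries\n", n_rot_max / 2);
}
```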
@@ -6860,10 +7444,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                throw std::runtime_error("unknown architecture");
        }
 
-
-
-
-
+        // generic pass: load optional per-tensor/per-expert ".scale" tensors (e.g. NVFP4 scale2)
+        // this avoids having to add scale loading to every architecture
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            // attention weight scales (per-tensor, shape {1})
+            if (!layer.wq_s && layer.wq) {
+                layer.wq_s = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wk_s && layer.wk) {
+                layer.wk_s = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wv_s && layer.wv) {
+                layer.wv_s = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wo_s && layer.wo) {
+                layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_s && layer.wqkv) {
+                layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.wqkv_gate_s && layer.wqkv_gate) {
+                layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+
+            // dense FFN weight scales (per-tensor, shape {1})
+            if (!layer.ffn_gate_s && layer.ffn_gate) {
+                layer.ffn_gate_s = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_s && layer.ffn_down) {
+                layer.ffn_down_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_s && layer.ffn_up) {
+                layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
+                layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
+                layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
+                layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+
+            // MoE expert weight scales (per-expert, shape {n_expert})
+            if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
+                layer.ffn_gate_exps_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_down_exps_s && layer.ffn_down_exps) {
+                layer.ffn_down_exps_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
+                layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
+            }
+
+            // recurrent / linear-attention weight scales (per-tensor, shape {1})
+            if (!layer.ssm_out_s && layer.ssm_out) {
+                layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_alpha_s && layer.ssm_alpha) {
+                layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.ssm_beta_s && layer.ssm_beta) {
+                layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+        }
    }
 
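The whole pass hangs on one guard per tensor: request the optional ".scale" only when the base weight exists and no architecture-specific code has already filled the slot. A self-contained sketch of the pattern, with stand-in types rather than the real ggml/llama.cpp API:

```cpp
#include <cstdio>

// Sketch of the guard pattern in the generic ".scale" pass above. Types and
// the lookup function are stand-ins, not llama.cpp API.
struct tensor { const char * name; };

// stands in for create_tensor(..., TENSOR_NOT_REQUIRED): may yield nullptr
tensor * find_optional_scale(const char * base, int il) {
    (void) base; (void) il;
    return nullptr; // pretend the file carries no scale tensors
}

struct layer_t {
    tensor * wq   = nullptr; // base attention weight
    tensor * wq_s = nullptr; // optional per-tensor scale, shape {1}
};

void load_scales(layer_t & layer, int il) {
    // weight exists and scale not loaded yet -> try the optional tensor
    if (!layer.wq_s && layer.wq) {
        layer.wq_s = find_optional_scale("attn_q", il);
    }
}

int main() {
    tensor wq { "blk.0.attn_q.weight" };
    layer_t layer { &wq, nullptr };
    load_scales(layer, 0);
    std::printf("scale loaded: %s\n", layer.wq_s ? "yes" : "no");
}
```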
@@ -6874,13 +7520,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
    // create the backend buffers
    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
-    ctx_buf_maps.reserve(ctx_map.size());
+    ctx_buf_maps.reserve(ml.ctx_map.size());
 
    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
-    const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    const size_t n_max_backend_buffer = ml.ctx_map.size() * ml.files.size();
    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-    for (auto & [buft, ctx_ptr] : ctx_map) {
+    for (auto & [buft, ctx_ptr] : ml.ctx_map) {
        ggml_context * ctx = ctx_ptr.get();
 
        // skip contexts without tensors
@@ -7101,59 +7747,62 @@ void llama_model::print_info() const {
    };
 
    // hparams
-    LLAMA_LOG_INFO("%s: arch
-    LLAMA_LOG_INFO("%s: vocab_only
-    LLAMA_LOG_INFO("%s: no_alloc
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
+    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+    LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc);
 
    if (!hparams.vocab_only) {
-        LLAMA_LOG_INFO("%s: n_ctx_train
-        LLAMA_LOG_INFO("%s: n_embd
-        LLAMA_LOG_INFO("%s: n_embd_inp
-        LLAMA_LOG_INFO("%s: n_layer
-        LLAMA_LOG_INFO("%s: n_head
-        LLAMA_LOG_INFO("%s: n_head_kv
-        LLAMA_LOG_INFO("%s: n_rot
-        LLAMA_LOG_INFO("%s: n_swa
-        LLAMA_LOG_INFO("%s: is_swa_any
-        LLAMA_LOG_INFO("%s: n_embd_head_k
-        LLAMA_LOG_INFO("%s: n_embd_head_v
-        LLAMA_LOG_INFO("%s: n_gqa
-        LLAMA_LOG_INFO("%s: n_embd_k_gqa
-        LLAMA_LOG_INFO("%s: n_embd_v_gqa
-        LLAMA_LOG_INFO("%s: f_norm_eps
-        LLAMA_LOG_INFO("%s: f_norm_rms_eps
-        LLAMA_LOG_INFO("%s: f_clamp_kqv
-        LLAMA_LOG_INFO("%s: f_max_alibi_bias
-        LLAMA_LOG_INFO("%s: f_logit_scale
-        LLAMA_LOG_INFO("%s: f_attn_scale
-        LLAMA_LOG_INFO("%s: n_ff
-        LLAMA_LOG_INFO("%s: n_expert
-        LLAMA_LOG_INFO("%s: n_expert_used
-        LLAMA_LOG_INFO("%s: n_expert_groups
-        LLAMA_LOG_INFO("%s: n_group_used
-        LLAMA_LOG_INFO("%s: causal attn
-        LLAMA_LOG_INFO("%s: pooling type
-        LLAMA_LOG_INFO("%s: rope type
-        LLAMA_LOG_INFO("%s: rope scaling
-        LLAMA_LOG_INFO("%s: freq_base_train
-        LLAMA_LOG_INFO("%s: freq_scale_train
+        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp());
+        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full);
+        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any());
+        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full);
+        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full);
+        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups);
+        LLAMA_LOG_INFO("%s: n_group_used = %d\n", __func__, hparams.n_group_used);
+        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
+        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
-            LLAMA_LOG_INFO("%s: freq_base_swa
-            LLAMA_LOG_INFO("%s: freq_scale_swa
+            LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa);
+            LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa);
+            LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa);
        }
-        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul);
-        LLAMA_LOG_INFO("%s: rope_finetuned
+        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
-            LLAMA_LOG_INFO("%s: mrope sections
+            LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]);
        }
        if (!classifier_labels.empty()) {
-            LLAMA_LOG_INFO("%s: n_cls_out
+            LLAMA_LOG_INFO("%s: n_cls_out = %u\n", __func__, hparams.n_cls_out);
 
            size_t i = 0;
            for (auto label : classifier_labels) {
-                LLAMA_LOG_INFO("%s: cls_label[%2zu]
+                LLAMA_LOG_INFO("%s: cls_label[%2zu] = %s\n", __func__, i++, label.c_str());
            }
        }
    }
@@ -7165,57 +7814,59 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_PLAMO2 ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_QWEN3NEXT ||
+        arch == LLM_ARCH_QWEN35 ||
+        arch == LLM_ARCH_QWEN35MOE ||
        arch == LLM_ARCH_NEMOTRON_H ||
        arch == LLM_ARCH_NEMOTRON_H_MOE) {
-        LLAMA_LOG_INFO("%s: ssm_d_conv
-        LLAMA_LOG_INFO("%s: ssm_d_inner
-        LLAMA_LOG_INFO("%s: ssm_d_state
-        LLAMA_LOG_INFO("%s: ssm_dt_rank
-        LLAMA_LOG_INFO("%s: ssm_n_group
-        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms
+        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_n_group = %u\n", __func__, hparams.ssm_n_group);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
    }
 
-    LLAMA_LOG_INFO("%s: model type
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
    if (pimpl->n_elements >= 1e12) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
    } else if (pimpl->n_elements >= 1e9) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
    } else if (pimpl->n_elements >= 1e6) {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
    } else {
-        LLAMA_LOG_INFO("%s: model params
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
    }
 
    // general kv
-    LLAMA_LOG_INFO("%s: general.name
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
 
    if (arch == LLM_ARCH_DEEPSEEK) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }
 
-    if (arch == LLM_ARCH_DEEPSEEK2) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_lora_q
-        LLAMA_LOG_INFO("%s: n_lora_kv
-        LLAMA_LOG_INFO("%s: n_embd_head_k_mla
-        LLAMA_LOG_INFO("%s: n_embd_head_v_mla
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
-        LLAMA_LOG_INFO("%s: expert_gating_func
+    if (arch == LLM_ARCH_DEEPSEEK2 || arch == LLM_ARCH_GLM_DSA) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
+        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }
 
    if (arch == LLM_ARCH_QWEN2MOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_shexp
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }
 
    if (arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || arch == LLM_ARCH_RND1) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
    }
 
    if (arch == LLM_ARCH_MINICPM ||
@@ -7223,41 +7874,41 @@ void llama_model::print_info() const {
        arch == LLM_ARCH_GRANITE_MOE ||
        arch == LLM_ARCH_GRANITE_HYBRID ||
        arch == LLM_ARCH_NEMOTRON_H_MOE) {
-        LLAMA_LOG_INFO("%s: f_embedding_scale
-        LLAMA_LOG_INFO("%s: f_residual_scale
-        LLAMA_LOG_INFO("%s: f_attention_scale
-        LLAMA_LOG_INFO("%s: n_ff_shexp
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }
 
    if (arch == LLM_ARCH_BAILINGMOE) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
    }
 
    if (arch == LLM_ARCH_BAILINGMOE2) {
-        LLAMA_LOG_INFO("%s: n_layer_dense_lead
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_shexp
-        LLAMA_LOG_INFO("%s: n_expert_shared
-        LLAMA_LOG_INFO("%s: expert_weights_scale
-        LLAMA_LOG_INFO("%s: expert_weights_norm
-        LLAMA_LOG_INFO("%s: expert_gating_func
-        LLAMA_LOG_INFO("%s: nextn_predict_layers
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers);
    }
 
    if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: expert_gating_func
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
    }
 
    if (arch == LLM_ARCH_GROVEMOE) {
-        LLAMA_LOG_INFO("%s: n_ff_exp
-        LLAMA_LOG_INFO("%s: n_ff_chexp
-        LLAMA_LOG_INFO("%s: n_group_experts
-        LLAMA_LOG_INFO("%s: expert_group_scale
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_chexp = %d\n", __func__, hparams.n_ff_chexp);
+        LLAMA_LOG_INFO("%s: n_group_experts = %d\n", __func__, hparams.n_group_experts);
+        LLAMA_LOG_INFO("%s: expert_group_scale = %.2f\n", __func__, hparams.expert_group_scale);
    }
 
    vocab.print_info();
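Several of the realigned log lines above format per-layer values through a print_f helper that takes a getter lambda plus the layer count. Its definition is outside this hunk, so the stand-in below only suggests one plausible behavior (collapse to a single number when all layers agree, otherwise list them all) and should not be read as the real implementation:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <string>

// Hypothetical stand-in for the print_f helper used above; the real one may
// format differently. It evaluates a per-layer getter across n_layer layers.
static std::string format_per_layer(const std::function<uint32_t(uint32_t)> & f, uint32_t n_layer) {
    bool uniform = true;
    for (uint32_t il = 1; il < n_layer; ++il) {
        uniform = uniform && f(il) == f(0);
    }
    if (uniform) {
        return std::to_string(f(0)); // all layers agree: print one number
    }
    std::string s = "[";
    for (uint32_t il = 0; il < n_layer; ++il) {
        s += (il ? ", " : "") + std::to_string(f(il));
    }
    return s + "]";
}

int main() {
    const uint32_t n_head[] = { 32, 32, 16, 32 };
    std::printf("n_head = %s\n",
                format_per_layer([&](uint32_t il) { return n_head[il]; }, 4).c_str());
}
```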
@@ -7372,6 +8023,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
        case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_GEMMA_EMBEDDING:
@@ -7396,7 +8048,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                cparams.n_seq_max,
                nullptr);
    } else if (llm_arch_is_hybrid(arch)) {
-
        // The main difference between hybrid architectures is the
        // layer filters, so pick the right one here
        llama_memory_hybrid::layer_filter_cb filter_attn = nullptr;
@@ -7413,23 +8064,44 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
            };
        }
 
-        res = new llama_memory_hybrid(
-            /* model */ *this,
-            /* attn_type_k */ params.type_k,
-            /* attn_type_v */ params.type_v,
-            /* attn_v_trans */ !cparams.flash_attn,
-            /* attn_kv_size */ cparams.n_ctx_seq,
-            /* attn_n_pad */ 1,
-            /* attn_n_swa */ hparams.n_swa,
-            /* attn_swa_type */ hparams.swa_type,
-            /* recurrent_type_k */ GGML_TYPE_F32,
-            /* recurrent_type_v */ GGML_TYPE_F32,
-            /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
-            /* n_seq_max */ cparams.n_seq_max,
-            /* offload */ cparams.offload_kqv,
-            /* unified */ cparams.kv_unified,
-            /* filter_attn */ std::move(filter_attn),
-            /* filter_recr */ std::move(filter_recr));
+        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+            // Use hybrid-iswa for hybrid models with SWA
+            res = new llama_memory_hybrid_iswa(
+                /* model */ *this,
+                /* attn_type_k */ params.type_k,
+                /* attn_type_v */ params.type_v,
+                /* attn_v_trans */ !cparams.flash_attn,
+                /* attn_swa_full */ params.swa_full,
+                /* attn_kv_size */ cparams.n_ctx_seq,
+                /* attn_n_ubatch */ cparams.n_ubatch,
+                /* attn_n_pad */ 1,
+                /* recurrent_type_r */ GGML_TYPE_F32,
+                /* recurrent_type_s */ GGML_TYPE_F32,
+                /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                /* n_seq_max */ cparams.n_seq_max,
+                /* offload */ cparams.offload_kqv,
+                /* unified */ cparams.kv_unified,
+                /* filter_attn */ std::move(filter_attn),
+                /* filter_recr */ std::move(filter_recr));
+        } else {
+            res = new llama_memory_hybrid(
+                /* model */ *this,
+                /* attn_type_k */ params.type_k,
+                /* attn_type_v */ params.type_v,
+                /* attn_v_trans */ !cparams.flash_attn,
+                /* attn_kv_size */ cparams.n_ctx_seq,
+                /* attn_n_pad */ 1,
+                /* attn_n_swa */ hparams.n_swa,
+                /* attn_swa_type */ hparams.swa_type,
+                /* recurrent_type_k */ GGML_TYPE_F32,
+                /* recurrent_type_v */ GGML_TYPE_F32,
+                /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                /* n_seq_max */ cparams.n_seq_max,
+                /* offload */ cparams.offload_kqv,
+                /* unified */ cparams.kv_unified,
+                /* filter_attn */ std::move(filter_attn),
+                /* filter_recr */ std::move(filter_recr));
+        }
    } else {
        llama_memory_i::layer_reuse_cb reuse = nullptr;
 
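The effect of the change above: hybrid (recurrent plus attention) models whose attention layers use a sliding window now get the new llama_memory_hybrid_iswa cache, while everything else keeps plain llama_memory_hybrid. A toy sketch of just the selection, with stand-in types instead of the real llama.cpp classes:

```cpp
#include <cstdio>

// Stand-in types; the real caches carry KV buffers, filters, etc.
enum class swa_type { none, standard };

struct memory { virtual ~memory() = default; };
struct memory_hybrid : memory {};
struct memory_hybrid_iswa : memory {}; // keeps separate SWA / non-SWA KV streams

memory * make_hybrid_memory(swa_type swa) {
    if (swa != swa_type::none) {
        return new memory_hybrid_iswa(); // window layers can evict old cells early
    }
    return new memory_hybrid();          // single full-context KV cache
}

int main() {
    memory * m = make_hybrid_memory(swa_type::standard);
    std::printf("%s\n", dynamic_cast<memory_hybrid_iswa *>(m) ? "iswa" : "plain");
    delete m;
}
```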
@@ -7549,6 +8221,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_neo_bert>(*this, params);
            } break;
+        case LLM_ARCH_EUROBERT:
+            {
+                llm = std::make_unique<llm_build_eurobert>(*this, params);
+            } break;
        case LLM_ARCH_BLOOM:
            {
                llm = std::make_unique<llm_build_bloom>(*this, params);
@@ -7748,6 +8424,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                llm = std::make_unique<llm_build_deepseek>(*this, params);
            } break;
        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_GLM_DSA:
            {
                llm = std::make_unique<llm_build_deepseek2>(*this, params);
            } break;
@@ -7790,6 +8467,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_jais>(*this, params);
            } break;
+        case LLM_ARCH_JAIS2:
+            {
+                llm = std::make_unique<llm_build_jais2>(*this, params);
+            } break;
        case LLM_ARCH_NEMOTRON:
            {
                llm = std::make_unique<llm_build_nemotron>(*this, params);
@@ -7811,6 +8492,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
                    llm = std::make_unique<llm_build_exaone4<false>>(*this, params);
                }
            } break;
+        case LLM_ARCH_EXAONE_MOE:
+            {
+                llm = std::make_unique<llm_build_exaone_moe>(*this, params);
+            } break;
        case LLM_ARCH_RWKV6:
            {
                llm = std::make_unique<llm_build_rwkv6>(*this, params);
@@ -7881,6 +8566,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_ernie4_5_moe>(*this, params);
            } break;
+        case LLM_ARCH_PADDLEOCR:
+            {
+                llm = std::make_unique<llm_build_paddleocr>(*this, params);
+            } break;
        case LLM_ARCH_HUNYUAN_MOE:
            {
                llm = std::make_unique<llm_build_hunyuan_moe>(*this, params);
@@ -7904,7 +8593,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
        case LLM_ARCH_LFM2:
        case LLM_ARCH_LFM2MOE:
            {
-                llm = std::make_unique<llm_build_lfm2>(*this, params);
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_lfm2<true>>(*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_lfm2<false>>(*this, params);
+                }
            } break;
        case LLM_ARCH_SMALLTHINKER:
            {
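The LFM2 builder is now picked between two template instantiations keyed on a runtime SWA check, which fixes the window/no-window distinction at instantiation time instead of re-testing it while the graph is built. A self-contained sketch of the same dispatch shape, with stand-in names rather than the real llama.cpp types:

```cpp
#include <cstdio>
#include <memory>

struct builder_base {
    virtual ~builder_base() = default;
    virtual void build() const = 0;
};

// one class template, two concrete graph builders selected at compile time
template <bool iswa>
struct lfm2_builder : builder_base {
    void build() const override {
        if constexpr (iswa) {
            std::printf("building graph with sliding-window attention\n");
        } else {
            std::printf("building graph with full attention\n");
        }
    }
};

int main() {
    const bool swa_standard = true; // stands in for the hparams.swa_type check
    std::unique_ptr<builder_base> llm;
    if (swa_standard) {
        llm = std::make_unique<lfm2_builder<true>>();
    } else {
        llm = std::make_unique<lfm2_builder<false>>();
    }
    llm->build();
}
```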
@@ -7938,6 +8631,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_qwen3next>(*this, params);
            } break;
+        case LLM_ARCH_QWEN35:
+            {
+                llm = std::make_unique<llm_build_qwen35>(*this, params);
+            } break;
+        case LLM_ARCH_QWEN35MOE:
+            {
+                llm = std::make_unique<llm_build_qwen35moe>(*this, params);
+            } break;
        case LLM_ARCH_MISTRAL3:
            {
                llm = std::make_unique<llm_build_mistral3>(*this, params);
@@ -7946,12 +8647,20 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_mimo2_iswa>(*this, params);
            } break;
+        case LLM_ARCH_KIMI_LINEAR:
+            {
+                llm = std::make_unique<llm_build_kimi_linear>(*this, params);
+            } break;
+        case LLM_ARCH_STEP35:
+            {
+                llm = std::make_unique<llm_build_step35_iswa>(*this, params);
+            } break;
        default:
            GGML_ABORT("fatal error");
    }
 
    // add on pooling layer
-    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
+    llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
 
    // add backend sampling layers (if any)
    llm->build_sampling();
@@ -7960,7 +8669,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
    // there will be two additional dense projection layers
    // dense linear projections are applied after pooling
    // TODO: move reranking logic here and generalize
-    llm->build_dense_out(dense_2_out_layers, dense_3_out_layers);
+    llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
 
    llm->res->set_outputs();
 
@@ -7985,7 +8694,7 @@ llama_model_params llama_model_default_params() {
        /*.kv_overrides =*/ nullptr,
        /*.vocab_only =*/ false,
        /*.use_mmap =*/ true,
-        /*.use_direct_io =*/
+        /*.use_direct_io =*/ false,
        /*.use_mlock =*/ false,
        /*.check_tensors =*/ false,
        /*.use_extra_bufts =*/ true,
@@ -8021,7 +8730,7 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
 }
 
 int32_t llama_model_n_embd_out(const llama_model * model) {
-    return model->hparams.
+    return model->hparams.n_embd_out();
 }
 
 int32_t llama_model_n_layer(const llama_model * model) {
@@ -8095,6 +8804,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_WAVTOKENIZER_DEC:
        case LLM_ARCH_NEMOTRON_H:
        case LLM_ARCH_NEMOTRON_H_MOE:
+        case LLM_ARCH_KIMI_LINEAR:
            return LLAMA_ROPE_TYPE_NONE;
 
        // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -8128,6 +8838,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MISTRAL3:
        case LLM_ARCH_LLAMA_EMBED:
        case LLM_ARCH_MAINCODER:
+        case LLM_ARCH_GLM_DSA:
            return LLAMA_ROPE_TYPE_NORM;
 
        // the pairs of head values are offset by n_rot/2
@@ -8140,6 +8851,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_MODERN_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_NOMIC_BERT_MOE:
+        case LLM_ARCH_EUROBERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
@@ -8171,10 +8883,12 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_EXAONE4:
+        case LLM_ARCH_EXAONE_MOE:
        case LLM_ARCH_MINICPM3:
        case LLM_ARCH_BAILINGMOE2:
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
+        case LLM_ARCH_JAIS2:
        case LLM_ARCH_OPENAI_MOE:
        case LLM_ARCH_HUNYUAN_DENSE:
        case LLM_ARCH_LFM2:
@@ -8189,12 +8903,16 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_AFMOE:
        case LLM_ARCH_QWEN3NEXT:
        case LLM_ARCH_MIMO2:
+        case LLM_ARCH_STEP35:
            return LLAMA_ROPE_TYPE_NEOX;
 
        case LLM_ARCH_QWEN2VL:
+        case LLM_ARCH_PADDLEOCR:
            return LLAMA_ROPE_TYPE_MROPE;
        case LLM_ARCH_QWEN3VL:
        case LLM_ARCH_QWEN3VLMOE:
+        case LLM_ARCH_QWEN35:
+        case LLM_ARCH_QWEN35MOE:
            return LLAMA_ROPE_TYPE_IMROPE;
 
        case LLM_ARCH_GLM4: