@novastera-oss/llamarn 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -2
- package/cpp/llama.cpp/README.md +4 -5
- package/cpp/llama.cpp/build-xcframework.sh +1 -1
- package/cpp/llama.cpp/common/CMakeLists.txt +4 -5
- package/cpp/llama.cpp/common/arg.cpp +24 -0
- package/cpp/llama.cpp/common/chat.cpp +37 -20
- package/cpp/llama.cpp/common/chat.h +2 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +5 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +860 -23
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +9 -0
- package/cpp/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +206 -10
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -1
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +36 -18
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +68 -5
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +16 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +37 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -103
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +1405 -240
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +212 -34
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +35 -11
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +187 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +71 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cross-entropy-loss.cu +2 -14
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +8 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +14 -12
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +15 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +269 -110
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +2 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +21 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +8 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +119 -58
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-conv.cu +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +192 -52
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +92 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -5
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +125 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -2
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +51 -9
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +394 -80
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +616 -239
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +741 -571
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gelu.cl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +337 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mat_f16_f32.cl +130 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/scale.cl +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +95 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +24 -11
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +697 -1098
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +104 -62
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +39 -38
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +131 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +767 -292
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +58 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +28 -23
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +14 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +38 -32
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +32 -27
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +44 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +11 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +128 -72
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +38 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +46 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +7 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +20 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +69 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +23 -3
- package/cpp/llama.cpp/ggml/src/ggml.c +449 -72
- package/cpp/llama.cpp/ggml/src/gguf.cpp +13 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +285 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +27 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +137 -21
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +109 -7
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -43
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +265 -3
- package/cpp/llama.cpp/src/llama-arch.h +36 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +596 -359
- package/cpp/llama.cpp/src/llama-batch.h +105 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +26 -6
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +286 -404
- package/cpp/llama.cpp/src/llama-graph.h +78 -79
- package/cpp/llama.cpp/src/llama-hparams.cpp +11 -1
- package/cpp/llama.cpp/src/llama-hparams.h +11 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +74 -66
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +312 -157
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +79 -46
- package/cpp/llama.cpp/src/llama-kv-cells.h +97 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +73 -69
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +19 -22
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +88 -77
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.cpp +17 -0
- package/cpp/llama.cpp/src/llama-memory.h +21 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +5301 -2922
- package/cpp/llama.cpp/src/llama-model.h +40 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +88 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +37 -3
- package/cpp/llama.cpp/src/llama-vocab.h +42 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/chat.h +2 -0
- package/ios/include/common.h +5 -0
- package/ios/include/llama.h +8 -43
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3891 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5059 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5030 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3889 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5095 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5066 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3919 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +1 -1
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +206 -10
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -43
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-kompute.h +0 -50
- package/cpp/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/cpp/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/common.comp +0 -112
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +0 -58
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +0 -25
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +0 -30
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +0 -17
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +0 -31
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +0 -38
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +0 -39
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +0 -44
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +0 -69
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +0 -51
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +0 -33
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +0 -35
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +0 -140
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +0 -106
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +0 -73
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +0 -28
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +0 -84
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +0 -21
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +0 -53
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +0 -52
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +0 -19
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +0 -23
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +0 -22
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +0 -72
- package/cpp/llama.cpp/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +0 -71
|
@@ -32,16 +32,21 @@ enum llm_type {
|
|
|
32
32
|
LLM_TYPE_190M,
|
|
33
33
|
LLM_TYPE_220M,
|
|
34
34
|
LLM_TYPE_250M,
|
|
35
|
+
LLM_TYPE_256M,
|
|
35
36
|
LLM_TYPE_270M,
|
|
36
37
|
LLM_TYPE_335M,
|
|
38
|
+
LLM_TYPE_350M,
|
|
37
39
|
LLM_TYPE_410M,
|
|
38
40
|
LLM_TYPE_450M,
|
|
39
41
|
LLM_TYPE_475M,
|
|
42
|
+
LLM_TYPE_700M,
|
|
40
43
|
LLM_TYPE_770M,
|
|
41
44
|
LLM_TYPE_780M,
|
|
45
|
+
LLM_TYPE_0_3B,
|
|
42
46
|
LLM_TYPE_0_5B,
|
|
43
47
|
LLM_TYPE_0_6B,
|
|
44
48
|
LLM_TYPE_1B,
|
|
49
|
+
LLM_TYPE_1_2B,
|
|
45
50
|
LLM_TYPE_1_3B,
|
|
46
51
|
LLM_TYPE_1_4B,
|
|
47
52
|
LLM_TYPE_1_5B,
|
|
@@ -93,8 +98,11 @@ enum llm_type {
|
|
|
93
98
|
LLM_TYPE_57B_A14B,
|
|
94
99
|
LLM_TYPE_17B_16E, // llama4 Scout
|
|
95
100
|
LLM_TYPE_17B_128E, // llama4 Maverick
|
|
101
|
+
LLM_TYPE_A13B,
|
|
96
102
|
LLM_TYPE_30B_A3B,
|
|
97
103
|
LLM_TYPE_235B_A22B,
|
|
104
|
+
LLM_TYPE_E2B,
|
|
105
|
+
LLM_TYPE_E4B,
|
|
98
106
|
};
|
|
99
107
|
|
|
100
108
|
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
|
|
@@ -150,6 +158,12 @@ struct llama_layer_convnext {
|
|
|
150
158
|
struct ggml_tensor * gamma = nullptr;
|
|
151
159
|
};
|
|
152
160
|
|
|
161
|
+
struct llama_layer_shortconv {
|
|
162
|
+
struct ggml_tensor * in_proj = nullptr;
|
|
163
|
+
struct ggml_tensor * conv = nullptr;
|
|
164
|
+
struct ggml_tensor * out_proj = nullptr;
|
|
165
|
+
};
|
|
166
|
+
|
|
153
167
|
struct llama_layer {
|
|
154
168
|
// normalization
|
|
155
169
|
struct ggml_tensor * attn_norm = nullptr;
|
|
@@ -169,6 +183,10 @@ struct llama_layer {
|
|
|
169
183
|
struct ggml_tensor * ffn_sub_norm = nullptr;
|
|
170
184
|
struct ggml_tensor * attn_norm_cross = nullptr;
|
|
171
185
|
struct ggml_tensor * attn_norm_enc = nullptr;
|
|
186
|
+
struct ggml_tensor * ssm_norm = nullptr;
|
|
187
|
+
struct ggml_tensor * ssm_dt_norm = nullptr;
|
|
188
|
+
struct ggml_tensor * ssm_b_norm = nullptr;
|
|
189
|
+
struct ggml_tensor * ssm_c_norm = nullptr;
|
|
172
190
|
|
|
173
191
|
// attention
|
|
174
192
|
struct ggml_tensor * wq = nullptr;
|
|
@@ -316,9 +334,24 @@ struct llama_layer {
|
|
|
316
334
|
struct ggml_tensor * ffn_up_scale = nullptr;
|
|
317
335
|
struct ggml_tensor * ffn_down_scale = nullptr;
|
|
318
336
|
|
|
337
|
+
// altup & laurel
|
|
338
|
+
struct ggml_tensor * per_layer_inp_gate = nullptr;
|
|
339
|
+
struct ggml_tensor * per_layer_proj = nullptr;
|
|
340
|
+
struct ggml_tensor * per_layer_post_norm = nullptr;
|
|
341
|
+
struct ggml_tensor * altup_correct_coef = nullptr;
|
|
342
|
+
struct ggml_tensor * altup_correct_scale = nullptr;
|
|
343
|
+
struct ggml_tensor * altup_predict_coef = nullptr;
|
|
344
|
+
struct ggml_tensor * altup_router = nullptr;
|
|
345
|
+
struct ggml_tensor * altup_router_norm = nullptr;
|
|
346
|
+
struct ggml_tensor * laurel_l = nullptr;
|
|
347
|
+
struct ggml_tensor * laurel_r = nullptr;
|
|
348
|
+
struct ggml_tensor * laurel_post_norm = nullptr;
|
|
349
|
+
|
|
319
350
|
struct llama_layer_posnet posnet;
|
|
320
351
|
|
|
321
352
|
struct llama_layer_convnext convnext;
|
|
353
|
+
|
|
354
|
+
struct llama_layer_shortconv shortconv;
|
|
322
355
|
};
|
|
323
356
|
|
|
324
357
|
struct llama_model {
|
|
@@ -354,6 +387,13 @@ struct llama_model {
|
|
|
354
387
|
struct ggml_tensor * conv1d = nullptr;
|
|
355
388
|
struct ggml_tensor * conv1d_b = nullptr;
|
|
356
389
|
|
|
390
|
+
// gemma3n altup
|
|
391
|
+
struct ggml_tensor * tok_embd_per_layer = nullptr;
|
|
392
|
+
struct ggml_tensor * altup_proj = nullptr;
|
|
393
|
+
struct ggml_tensor * altup_unembd_proj = nullptr;
|
|
394
|
+
struct ggml_tensor * per_layer_model_proj = nullptr;
|
|
395
|
+
struct ggml_tensor * per_layer_proj_norm = nullptr;
|
|
396
|
+
|
|
357
397
|
std::vector<llama_layer> layers;
|
|
358
398
|
|
|
359
399
|
llama_model_params params;
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
#include "llama-quant.h"
|
|
2
|
-
|
|
3
2
|
#include "llama-impl.h"
|
|
4
3
|
#include "llama-model.h"
|
|
5
4
|
#include "llama-model-loader.h"
|
|
@@ -27,6 +26,56 @@ static void zeros(std::ofstream & file, size_t n) {
|
|
|
27
26
|
}
|
|
28
27
|
}
|
|
29
28
|
|
|
29
|
+
static std::string remap_layer(const std::string & orig_name, const std::vector<int> & prune, std::map<int, std::string> & mapped, int & next_id) {
|
|
30
|
+
if (prune.empty()) {
|
|
31
|
+
return orig_name;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
static const std::regex pattern(R"(blk\.(\d+)\.)");
|
|
35
|
+
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
|
|
36
|
+
const int blk = std::stoi(match[1]);
|
|
37
|
+
std::string new_name = orig_name;
|
|
38
|
+
|
|
39
|
+
if (mapped.count(blk)) {
|
|
40
|
+
// Already mapped, do nothing
|
|
41
|
+
} else if (std::find(prune.begin(), prune.end(), blk) != prune.end()) {
|
|
42
|
+
mapped[blk] = "";
|
|
43
|
+
} else if (blk < prune.front()) {
|
|
44
|
+
mapped[blk] = std::to_string(blk);
|
|
45
|
+
next_id = blk + 1;
|
|
46
|
+
} else {
|
|
47
|
+
mapped[blk] = std::to_string(next_id);
|
|
48
|
+
++next_id;
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
return mapped[blk].empty() ? mapped[blk] : new_name.replace(match.position(1), match.length(1), mapped[blk]);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
return orig_name;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
static std::string remap_imatrix (const std::string & orig_name, const std::map<int, std::string> & mapped) {
|
|
58
|
+
if (mapped.empty()) {
|
|
59
|
+
return orig_name;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
static const std::regex pattern(R"(blk\.(\d+)\.)");
|
|
63
|
+
if (std::smatch match; std::regex_search(orig_name, match, pattern)) {
|
|
64
|
+
const std::string blk(match[1]);
|
|
65
|
+
std::string new_name = orig_name;
|
|
66
|
+
|
|
67
|
+
for (const auto & p : mapped) {
|
|
68
|
+
if (p.second == blk) {
|
|
69
|
+
LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
|
|
70
|
+
return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
GGML_ABORT("\n%s: imatrix mapping error for %s\n", __func__, orig_name.c_str());
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
return orig_name;
|
|
77
|
+
}
|
|
78
|
+
|
|
30
79
|
struct quantize_state_impl {
|
|
31
80
|
const llama_model & model;
|
|
32
81
|
const llama_model_quantize_params * params;
|
|
@@ -174,7 +223,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
|
|
|
174
223
|
new_type = GGML_TYPE_Q6_K;
|
|
175
224
|
}
|
|
176
225
|
}
|
|
177
|
-
} else if (name == "token_embd.weight") {
|
|
226
|
+
} else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
|
|
178
227
|
if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
|
|
179
228
|
new_type = qs.params->token_embedding_type;
|
|
180
229
|
} else {
|
|
@@ -568,6 +617,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
568
617
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
|
569
618
|
gguf_context_ptr ctx_out { gguf_init_empty() };
|
|
570
619
|
|
|
620
|
+
std::vector<int> prune_list = {};
|
|
621
|
+
if (params->prune_layers) {
|
|
622
|
+
prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
|
|
623
|
+
}
|
|
624
|
+
|
|
571
625
|
// copy the KV pairs from the input file
|
|
572
626
|
gguf_set_kv (ctx_out.get(), ml.meta.get());
|
|
573
627
|
gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
|
|
@@ -597,12 +651,32 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
597
651
|
}
|
|
598
652
|
}
|
|
599
653
|
|
|
654
|
+
std::map<int, std::string> mapped;
|
|
655
|
+
int blk_id = 0;
|
|
656
|
+
int pruned_attention_w = 0;
|
|
657
|
+
|
|
600
658
|
// make a list of weights
|
|
601
659
|
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
|
|
602
660
|
tensors.reserve(ml.weights_map.size());
|
|
603
661
|
for (const auto & it : ml.weights_map) {
|
|
662
|
+
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
|
|
663
|
+
if (remapped_name.empty()) {
|
|
664
|
+
if (it.first.find("attn_v.weight") != std::string::npos ||
|
|
665
|
+
it.first.find("attn_qkv.weight") != std::string::npos ||
|
|
666
|
+
it.first.find("attn_kv_b.weight") != std::string::npos) {
|
|
667
|
+
pruned_attention_w++;
|
|
668
|
+
}
|
|
669
|
+
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
|
|
670
|
+
continue;
|
|
671
|
+
} else if (remapped_name != it.first) {
|
|
672
|
+
ggml_set_name(it.second.tensor, remapped_name.c_str());
|
|
673
|
+
LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
|
|
674
|
+
}
|
|
604
675
|
tensors.push_back(&it.second);
|
|
605
676
|
}
|
|
677
|
+
if (!prune_list.empty()) {
|
|
678
|
+
gguf_set_val_u32(ctx_out.get(), ml.llm_kv(LLM_KV_BLOCK_COUNT).c_str(), blk_id);
|
|
679
|
+
}
|
|
606
680
|
|
|
607
681
|
// keep_split requires that the weights are sorted by split index
|
|
608
682
|
if (params->keep_split) {
|
|
@@ -640,7 +714,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
640
714
|
if (llama_model_has_encoder(&model)) {
|
|
641
715
|
n_attn_layer *= 3;
|
|
642
716
|
}
|
|
643
|
-
GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
|
|
717
|
+
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
|
|
644
718
|
}
|
|
645
719
|
|
|
646
720
|
size_t total_size_org = 0;
|
|
@@ -681,7 +755,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
681
755
|
for (size_t i = 0; i < ctx_outs.size(); ++i) {
|
|
682
756
|
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
|
|
683
757
|
gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
|
|
684
|
-
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(),
|
|
758
|
+
gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), (int32_t)tensors.size());
|
|
685
759
|
}
|
|
686
760
|
}
|
|
687
761
|
|
|
@@ -756,6 +830,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
756
830
|
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
757
831
|
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
|
758
832
|
|
|
833
|
+
// these are very small (e.g. 4x4)
|
|
834
|
+
quantize &= name.find("altup") == std::string::npos;
|
|
835
|
+
quantize &= name.find("laurel") == std::string::npos;
|
|
836
|
+
|
|
837
|
+
// these are not too big, so keep them as they are
|
|
838
|
+
quantize &= name.find("per_layer_model_proj") == std::string::npos;
|
|
839
|
+
|
|
759
840
|
// do not quantize positional embeddings and token types (BERT)
|
|
760
841
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
|
761
842
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
|
|
@@ -763,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
763
844
|
// do not quantize Mamba's small yet 2D weights
|
|
764
845
|
// NOTE: can't use LLM_TN here because the layer number is not known
|
|
765
846
|
quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
|
|
847
|
+
quantize &= name.find("shortconv.conv.weight") == std::string::npos;
|
|
766
848
|
|
|
767
849
|
// do not quantize RWKV's small yet 2D weights
|
|
768
850
|
quantize &= name.find("time_mix_first.weight") == std::string::npos;
|
|
@@ -832,7 +914,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
|
|
832
914
|
|
|
833
915
|
const float * imatrix = nullptr;
|
|
834
916
|
if (imatrix_data) {
|
|
835
|
-
auto it = imatrix_data->find(tensor->name);
|
|
917
|
+
auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
|
|
836
918
|
if (it == imatrix_data->end()) {
|
|
837
919
|
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
|
838
920
|
} else {
|
|
@@ -947,6 +1029,7 @@ llama_model_quantize_params llama_model_quantize_default_params() {
|
|
|
947
1029
|
/*.imatrix =*/ nullptr,
|
|
948
1030
|
/*.kv_overrides =*/ nullptr,
|
|
949
1031
|
/*.tensor_type =*/ nullptr,
|
|
1032
|
+
/*.prune_layers =*/ nullptr
|
|
950
1033
|
};
|
|
951
1034
|
|
|
952
1035
|
return result;
|
|
@@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
351
351
|
break;
|
|
352
352
|
case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
|
|
353
353
|
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
|
354
|
+
case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
|
|
354
355
|
regex_exprs = {
|
|
355
356
|
// original regex from tokenizer.json
|
|
356
357
|
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
|
@@ -1269,6 +1270,7 @@ struct llama_vocab::impl {
|
|
|
1269
1270
|
bool add_space_prefix = false;
|
|
1270
1271
|
bool add_bos = false;
|
|
1271
1272
|
bool add_eos = false;
|
|
1273
|
+
bool add_sep = false;
|
|
1272
1274
|
bool ignore_merges = false;
|
|
1273
1275
|
bool clean_spaces = false; // clean_up_tokenization_spaces
|
|
1274
1276
|
bool remove_extra_whitespaces = false;
|
|
@@ -1421,6 +1423,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1421
1423
|
special_sep_id = 102;
|
|
1422
1424
|
special_pad_id = 0;
|
|
1423
1425
|
special_mask_id = 103;
|
|
1426
|
+
|
|
1427
|
+
add_sep = true;
|
|
1424
1428
|
} else if (tokenizer_model == "gpt2") {
|
|
1425
1429
|
type = LLAMA_VOCAB_TYPE_BPE;
|
|
1426
1430
|
|
|
@@ -1519,7 +1523,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1519
1523
|
tokenizer_pre == "llama-v3" ||
|
|
1520
1524
|
tokenizer_pre == "llama-bpe"||
|
|
1521
1525
|
tokenizer_pre == "falcon3" ||
|
|
1522
|
-
tokenizer_pre == "
|
|
1526
|
+
tokenizer_pre == "falcon-h1" ||
|
|
1527
|
+
tokenizer_pre == "pixtral" ||
|
|
1528
|
+
tokenizer_pre == "midm-2.0" ||
|
|
1529
|
+
tokenizer_pre == "lfm2") {
|
|
1523
1530
|
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
|
|
1524
1531
|
ignore_merges = true;
|
|
1525
1532
|
add_bos = true;
|
|
@@ -1550,12 +1557,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1550
1557
|
tokenizer_pre == "jina-es" ||
|
|
1551
1558
|
tokenizer_pre == "jina-de" ||
|
|
1552
1559
|
tokenizer_pre == "gigachat" ||
|
|
1553
|
-
tokenizer_pre == "jina-v1-en" ||
|
|
1554
1560
|
tokenizer_pre == "jina-v2-es" ||
|
|
1555
1561
|
tokenizer_pre == "jina-v2-de" ||
|
|
1562
|
+
tokenizer_pre == "a.x-4.0") {
|
|
1563
|
+
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
1564
|
+
} else if (
|
|
1565
|
+
tokenizer_pre == "jina-v1-en" ||
|
|
1556
1566
|
tokenizer_pre == "jina-v2-code" ||
|
|
1557
1567
|
tokenizer_pre == "roberta-bpe") {
|
|
1558
1568
|
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
|
1569
|
+
add_sep = true;
|
|
1559
1570
|
} else if (
|
|
1560
1571
|
tokenizer_pre == "refact") {
|
|
1561
1572
|
pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
|
|
@@ -1650,6 +1661,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1650
1661
|
tokenizer_pre == "seed-coder") {
|
|
1651
1662
|
pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
|
|
1652
1663
|
clean_spaces = false;
|
|
1664
|
+
} else if (
|
|
1665
|
+
tokenizer_pre == "hunyuan") {
|
|
1666
|
+
pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
|
|
1667
|
+
clean_spaces = false;
|
|
1653
1668
|
} else {
|
|
1654
1669
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
|
1655
1670
|
}
|
|
@@ -1665,6 +1680,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1665
1680
|
clean_spaces = true;
|
|
1666
1681
|
add_bos = true;
|
|
1667
1682
|
add_eos = false;
|
|
1683
|
+
add_sep = true;
|
|
1668
1684
|
} else if (type == LLAMA_VOCAB_TYPE_UGM) {
|
|
1669
1685
|
pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
1670
1686
|
add_bos = false;
|
|
@@ -1801,7 +1817,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1801
1817
|
}
|
|
1802
1818
|
}
|
|
1803
1819
|
|
|
1804
|
-
// Handle add_bos and add_eos
|
|
1820
|
+
// Handle add_bos, add_eos and add_sep
|
|
1805
1821
|
{
|
|
1806
1822
|
bool temp = true;
|
|
1807
1823
|
|
|
@@ -1811,6 +1827,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1811
1827
|
if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
|
|
1812
1828
|
add_eos = temp;
|
|
1813
1829
|
}
|
|
1830
|
+
if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
|
|
1831
|
+
add_sep = temp;
|
|
1832
|
+
}
|
|
1814
1833
|
}
|
|
1815
1834
|
|
|
1816
1835
|
// auto-detect special tokens by text
|
|
@@ -1829,6 +1848,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1829
1848
|
|| t.first == "<EOT>"
|
|
1830
1849
|
|| t.first == "_<EOT>"
|
|
1831
1850
|
|| t.first == "<|end▁of▁sentence|>" // DeepSeek
|
|
1851
|
+
|| t.first == "<end_of_utterance>" // smoldocling
|
|
1832
1852
|
) {
|
|
1833
1853
|
special_eot_id = t.second;
|
|
1834
1854
|
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
|
@@ -1988,6 +2008,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
|
1988
2008
|
|| t.first == "<EOT>"
|
|
1989
2009
|
|| t.first == "_<EOT>"
|
|
1990
2010
|
|| t.first == "<|end_of_text|>"
|
|
2011
|
+
|| t.first == "<end_of_utterance>" // smoldocling
|
|
1991
2012
|
) {
|
|
1992
2013
|
special_eog_ids.insert(t.second);
|
|
1993
2014
|
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
|
|
@@ -3000,6 +3021,10 @@ bool llama_vocab::get_add_eos() const {
|
|
|
3000
3021
|
return pimpl->add_eos;
|
|
3001
3022
|
}
|
|
3002
3023
|
|
|
3024
|
+
bool llama_vocab::get_add_sep() const {
|
|
3025
|
+
return pimpl->add_sep;
|
|
3026
|
+
}
|
|
3027
|
+
|
|
3003
3028
|
bool llama_vocab::get_ignore_merges() const {
|
|
3004
3029
|
return pimpl->ignore_merges;
|
|
3005
3030
|
}
|
|
@@ -3060,6 +3085,11 @@ int32_t llama_vocab::tokenize(
|
|
|
3060
3085
|
bool add_special,
|
|
3061
3086
|
bool parse_special) const {
|
|
3062
3087
|
auto res = tokenize(std::string(text, text_len), add_special, parse_special);
|
|
3088
|
+
if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
|
|
3089
|
+
LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
|
|
3090
|
+
return std::numeric_limits<int32_t>::min();
|
|
3091
|
+
}
|
|
3092
|
+
|
|
3063
3093
|
if (n_tokens_max < (int) res.size()) {
|
|
3064
3094
|
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
|
3065
3095
|
return -((int) res.size());
|
|
@@ -3191,6 +3221,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
|
|
|
3191
3221
|
return vocab->get_add_eos();
|
|
3192
3222
|
}
|
|
3193
3223
|
|
|
3224
|
+
bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
|
|
3225
|
+
return vocab->get_add_sep();
|
|
3226
|
+
}
|
|
3227
|
+
|
|
3194
3228
|
llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
|
|
3195
3229
|
return vocab->token_fim_pre();
|
|
3196
3230
|
}
|
|
@@ -6,6 +6,47 @@
|
|
|
6
6
|
#include <vector>
|
|
7
7
|
#include <memory>
|
|
8
8
|
|
|
9
|
+
// pre-tokenization types
|
|
10
|
+
enum llama_vocab_pre_type {
|
|
11
|
+
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
|
12
|
+
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
|
13
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
|
14
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
|
15
|
+
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
|
16
|
+
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
|
17
|
+
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
|
18
|
+
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
|
19
|
+
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
|
20
|
+
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
|
21
|
+
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
|
22
|
+
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
|
23
|
+
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
|
24
|
+
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
|
25
|
+
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
|
26
|
+
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
|
27
|
+
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
|
28
|
+
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
|
29
|
+
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
|
30
|
+
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
|
31
|
+
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
|
32
|
+
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
|
33
|
+
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
|
34
|
+
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
|
35
|
+
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
|
36
|
+
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
37
|
+
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
|
38
|
+
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
|
39
|
+
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
|
40
|
+
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
|
41
|
+
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
|
42
|
+
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
|
43
|
+
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
|
44
|
+
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
|
45
|
+
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
|
46
|
+
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
|
47
|
+
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
|
48
|
+
};
|
|
49
|
+
|
|
9
50
|
struct LLM_KV;
|
|
10
51
|
struct llama_model_loader;
|
|
11
52
|
|
|
@@ -74,6 +115,7 @@ struct llama_vocab {
|
|
|
74
115
|
bool get_add_space_prefix () const;
|
|
75
116
|
bool get_add_bos () const;
|
|
76
117
|
bool get_add_eos () const;
|
|
118
|
+
bool get_add_sep () const;
|
|
77
119
|
bool get_ignore_merges () const;
|
|
78
120
|
bool get_clean_spaces () const;
|
|
79
121
|
bool get_remove_extra_whitespaces () const;
|
package/cpp/rn-utils.h
CHANGED
|
@@ -54,6 +54,7 @@ struct CompletionOptions {
|
|
|
54
54
|
float top_p = 0.9f;
|
|
55
55
|
float top_k = 40.0f;
|
|
56
56
|
float min_p = 0.05f;
|
|
57
|
+
float presence_penalty = 0.0f; // for reducing repetitions (0-2 range)
|
|
57
58
|
int n_keep = 0;
|
|
58
59
|
int n_probs = 0; // for log probabilities
|
|
59
60
|
bool post_sampling_probs = false;
|
|
@@ -77,6 +78,7 @@ struct CompletionOptions {
|
|
|
77
78
|
{"top_p", top_p},
|
|
78
79
|
{"top_k", top_k},
|
|
79
80
|
{"min_p", min_p},
|
|
81
|
+
{"presence_penalty", presence_penalty},
|
|
80
82
|
{"n_predict", n_predict},
|
|
81
83
|
{"n_keep", n_keep},
|
|
82
84
|
{"n_probs", n_probs},
|
|
@@ -147,6 +149,7 @@ struct CompletionOptions {
|
|
|
147
149
|
data["top_p"] = top_p;
|
|
148
150
|
data["max_tokens"] = n_predict;
|
|
149
151
|
data["stream"] = stream;
|
|
152
|
+
data["presence_penalty"] = presence_penalty;
|
|
150
153
|
|
|
151
154
|
if (seed >= 0) {
|
|
152
155
|
data["seed"] = seed;
|
package/ios/include/chat.h
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
#include <chrono>
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <vector>
|
|
10
|
+
#include <map>
|
|
10
11
|
|
|
11
12
|
struct common_chat_templates;
|
|
12
13
|
|
|
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
|
|
|
125
126
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
|
|
126
127
|
bool enable_thinking = true;
|
|
127
128
|
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
|
|
129
|
+
std::map<std::string, std::string> chat_template_kwargs;
|
|
128
130
|
};
|
|
129
131
|
|
|
130
132
|
struct common_chat_params {
|
package/ios/include/common.h
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
#include <string>
|
|
9
9
|
#include <string_view>
|
|
10
10
|
#include <vector>
|
|
11
|
+
#include <map>
|
|
11
12
|
#include <sstream>
|
|
12
13
|
|
|
13
14
|
#ifdef _WIN32
|
|
@@ -358,6 +359,7 @@ struct common_params {
|
|
|
358
359
|
int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
|
359
360
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
|
360
361
|
std::string embd_sep = "\n"; // separator of embeddings
|
|
362
|
+
std::string cls_sep = "\t"; // separator of classification sequences
|
|
361
363
|
|
|
362
364
|
// server params
|
|
363
365
|
int32_t port = 8080; // server listens on this network port
|
|
@@ -368,6 +370,7 @@ struct common_params {
|
|
|
368
370
|
|
|
369
371
|
std::string hostname = "127.0.0.1";
|
|
370
372
|
std::string public_path = ""; // NOLINT
|
|
373
|
+
std::string api_prefix = ""; // NOLINT
|
|
371
374
|
std::string chat_template = ""; // NOLINT
|
|
372
375
|
bool use_jinja = false; // NOLINT
|
|
373
376
|
bool enable_chat_template = true;
|
|
@@ -380,6 +383,8 @@ struct common_params {
|
|
|
380
383
|
std::string ssl_file_key = ""; // NOLINT
|
|
381
384
|
std::string ssl_file_cert = ""; // NOLINT
|
|
382
385
|
|
|
386
|
+
std::map<std::string, std::string> default_template_kwargs;
|
|
387
|
+
|
|
383
388
|
// "advanced" endpoints are disabled by default for better security
|
|
384
389
|
bool webui = true;
|
|
385
390
|
bool endpoint_slots = false;
|
package/ios/include/llama.h
CHANGED
|
@@ -79,46 +79,6 @@ extern "C" {
|
|
|
79
79
|
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
|
80
80
|
};
|
|
81
81
|
|
|
82
|
-
// pre-tokenization types
|
|
83
|
-
enum llama_vocab_pre_type {
|
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
|
90
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
|
91
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
|
92
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
|
93
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
|
94
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
|
95
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
|
96
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
|
97
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
|
98
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
|
99
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
|
100
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
|
101
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
|
102
|
-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
|
103
|
-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
|
104
|
-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
|
105
|
-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
|
106
|
-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
|
107
|
-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
|
108
|
-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
|
109
|
-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
110
|
-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
|
111
|
-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
|
112
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
|
113
|
-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
|
114
|
-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
|
115
|
-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
|
116
|
-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
|
117
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
|
118
|
-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
|
119
|
-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
|
120
|
-
};
|
|
121
|
-
|
|
122
82
|
enum llama_rope_type {
|
|
123
83
|
LLAMA_ROPE_TYPE_NONE = -1,
|
|
124
84
|
LLAMA_ROPE_TYPE_NORM = 0,
|
|
@@ -390,6 +350,7 @@ extern "C" {
|
|
|
390
350
|
void * imatrix; // pointer to importance matrix data
|
|
391
351
|
void * kv_overrides; // pointer to vector containing overrides
|
|
392
352
|
void * tensor_types; // pointer to vector containing tensor types
|
|
353
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
|
393
354
|
} llama_model_quantize_params;
|
|
394
355
|
|
|
395
356
|
typedef struct llama_logit_bias {
|
|
@@ -943,12 +904,14 @@ extern "C" {
|
|
|
943
904
|
// Requires the context to have a memory.
|
|
944
905
|
// For encode-decoder contexts, processes the batch using the decoder.
|
|
945
906
|
// Positive return values do not mean a fatal error, but rather a warning.
|
|
946
|
-
// Upon
|
|
907
|
+
// Upon fatal-error or abort, the ubatches that managed to be processed will remain in the memory state of the context
|
|
908
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
|
909
|
+
// Upon other return values, the memory state is restored to the state before this call
|
|
947
910
|
// 0 - success
|
|
948
911
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
949
|
-
// 2 - aborted
|
|
912
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
|
950
913
|
// -1 - invalid input batch
|
|
951
|
-
// < -1 - error
|
|
914
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
|
952
915
|
LLAMA_API int32_t llama_decode(
|
|
953
916
|
struct llama_context * ctx,
|
|
954
917
|
struct llama_batch batch);
|
|
@@ -1044,6 +1007,7 @@ extern "C" {
|
|
|
1044
1007
|
|
|
1045
1008
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1046
1009
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
1010
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
|
1047
1011
|
|
|
1048
1012
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
|
1049
1013
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
|
@@ -1087,6 +1051,7 @@ extern "C" {
|
|
|
1087
1051
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
|
1088
1052
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
|
1089
1053
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
|
1054
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
|
1090
1055
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
|
1091
1056
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
|
1092
1057
|
/// as plaintext. Does not insert a leading space.
|