@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
|
@@ -105,6 +105,7 @@ class Keys:
|
|
|
105
105
|
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
|
106
106
|
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
|
107
107
|
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
|
108
|
+
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
|
|
108
109
|
POOLING_TYPE = "{arch}.pooling_type"
|
|
109
110
|
LOGIT_SCALE = "{arch}.logit_scale"
|
|
110
111
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
|
@@ -230,8 +231,10 @@ class Keys:
|
|
|
230
231
|
MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
|
231
232
|
|
|
232
233
|
class Adapter:
|
|
233
|
-
TYPE
|
|
234
|
-
LORA_ALPHA
|
|
234
|
+
TYPE = "adapter.type"
|
|
235
|
+
LORA_ALPHA = "adapter.lora.alpha"
|
|
236
|
+
LORA_TASK_NAME = "adapter.lora.task_name"
|
|
237
|
+
LORA_PROMPT_PREFIX = "adapter.lora.prompt_prefix"
|
|
235
238
|
|
|
236
239
|
class IMatrix:
|
|
237
240
|
CHUNK_COUNT = "imatrix.chunk_count"
|
|
@@ -279,6 +282,9 @@ class Keys:
|
|
|
279
282
|
class Projector:
|
|
280
283
|
STACK_FACTOR = "clip.audio.projector.stack_factor"
|
|
281
284
|
|
|
285
|
+
class Diffusion:
|
|
286
|
+
SHIFT_LOGITS = "diffusion.shift_logits"
|
|
287
|
+
|
|
282
288
|
#
|
|
283
289
|
# recommended mapping of model tensor names for storage in gguf
|
|
284
290
|
#
|
|
@@ -311,6 +317,7 @@ class MODEL_ARCH(IntEnum):
|
|
|
311
317
|
NOMIC_BERT_MOE = auto()
|
|
312
318
|
NEO_BERT = auto()
|
|
313
319
|
JINA_BERT_V2 = auto()
|
|
320
|
+
JINA_BERT_V3 = auto()
|
|
314
321
|
BLOOM = auto()
|
|
315
322
|
STABLELM = auto()
|
|
316
323
|
QWEN = auto()
|
|
@@ -354,11 +361,13 @@ class MODEL_ARCH(IntEnum):
|
|
|
354
361
|
DEEPSEEK2 = auto()
|
|
355
362
|
CHATGLM = auto()
|
|
356
363
|
GLM4 = auto()
|
|
364
|
+
GLM4_MOE = auto()
|
|
357
365
|
BITNET = auto()
|
|
358
366
|
T5 = auto()
|
|
359
367
|
T5ENCODER = auto()
|
|
360
368
|
JAIS = auto()
|
|
361
369
|
NEMOTRON = auto()
|
|
370
|
+
NEMOTRON_H = auto()
|
|
362
371
|
EXAONE = auto()
|
|
363
372
|
EXAONE4 = auto()
|
|
364
373
|
GRANITE = auto()
|
|
@@ -373,9 +382,14 @@ class MODEL_ARCH(IntEnum):
|
|
|
373
382
|
ERNIE4_5 = auto()
|
|
374
383
|
ERNIE4_5_MOE = auto()
|
|
375
384
|
HUNYUAN_MOE = auto()
|
|
385
|
+
HUNYUAN_DENSE = auto()
|
|
376
386
|
SMOLLM3 = auto()
|
|
387
|
+
GPT_OSS = auto()
|
|
377
388
|
LFM2 = auto()
|
|
378
389
|
DREAM = auto()
|
|
390
|
+
SMALLTHINKER = auto()
|
|
391
|
+
LLADA = auto()
|
|
392
|
+
SEED_OSS = auto()
|
|
379
393
|
|
|
380
394
|
|
|
381
395
|
class VISION_PROJECTOR_TYPE(IntEnum):
|
|
@@ -408,6 +422,7 @@ class MODEL_TENSOR(IntEnum):
|
|
|
408
422
|
ATTN_OUT_NORM = auto()
|
|
409
423
|
ATTN_POST_NORM = auto()
|
|
410
424
|
ATTN_ROT_EMBD = auto()
|
|
425
|
+
ATTN_SINKS = auto()
|
|
411
426
|
FFN_GATE_INP = auto()
|
|
412
427
|
FFN_GATE_INP_SHEXP = auto()
|
|
413
428
|
FFN_NORM = auto()
|
|
@@ -608,6 +623,13 @@ class MODEL_TENSOR(IntEnum):
|
|
|
608
623
|
A_MMPROJ_FC = auto()
|
|
609
624
|
A_MM_NORM_PRE = auto()
|
|
610
625
|
A_MM_NORM_MID = auto()
|
|
626
|
+
# nextn/mtp
|
|
627
|
+
NEXTN_EH_PROJ = auto()
|
|
628
|
+
NEXTN_EMBED_TOKENS = auto()
|
|
629
|
+
NEXTN_ENORM = auto()
|
|
630
|
+
NEXTN_HNORM = auto()
|
|
631
|
+
NEXTN_SHARED_HEAD_HEAD = auto()
|
|
632
|
+
NEXTN_SHARED_HEAD_NORM = auto()
|
|
611
633
|
|
|
612
634
|
|
|
613
635
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
@@ -629,6 +651,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
|
629
651
|
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
|
630
652
|
MODEL_ARCH.NEO_BERT: "neo-bert",
|
|
631
653
|
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
|
654
|
+
MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3",
|
|
632
655
|
MODEL_ARCH.BLOOM: "bloom",
|
|
633
656
|
MODEL_ARCH.STABLELM: "stablelm",
|
|
634
657
|
MODEL_ARCH.QWEN: "qwen",
|
|
@@ -672,11 +695,13 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
|
672
695
|
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
|
673
696
|
MODEL_ARCH.CHATGLM: "chatglm",
|
|
674
697
|
MODEL_ARCH.GLM4: "glm4",
|
|
698
|
+
MODEL_ARCH.GLM4_MOE: "glm4moe",
|
|
675
699
|
MODEL_ARCH.BITNET: "bitnet",
|
|
676
700
|
MODEL_ARCH.T5: "t5",
|
|
677
701
|
MODEL_ARCH.T5ENCODER: "t5encoder",
|
|
678
702
|
MODEL_ARCH.JAIS: "jais",
|
|
679
703
|
MODEL_ARCH.NEMOTRON: "nemotron",
|
|
704
|
+
MODEL_ARCH.NEMOTRON_H: "nemotron_h",
|
|
680
705
|
MODEL_ARCH.EXAONE: "exaone",
|
|
681
706
|
MODEL_ARCH.EXAONE4: "exaone4",
|
|
682
707
|
MODEL_ARCH.GRANITE: "granite",
|
|
@@ -692,9 +717,14 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|
|
692
717
|
MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
|
|
693
718
|
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
|
694
719
|
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
|
720
|
+
MODEL_ARCH.HUNYUAN_DENSE: "hunyuan-dense",
|
|
695
721
|
MODEL_ARCH.SMOLLM3: "smollm3",
|
|
722
|
+
MODEL_ARCH.GPT_OSS: "gpt-oss",
|
|
696
723
|
MODEL_ARCH.LFM2: "lfm2",
|
|
697
724
|
MODEL_ARCH.DREAM: "dream",
|
|
725
|
+
MODEL_ARCH.SMALLTHINKER: "smallthinker",
|
|
726
|
+
MODEL_ARCH.LLADA: "llada",
|
|
727
|
+
MODEL_ARCH.SEED_OSS: "seed_oss",
|
|
698
728
|
}
|
|
699
729
|
|
|
700
730
|
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
|
@@ -725,6 +755,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|
|
725
755
|
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
|
726
756
|
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
|
727
757
|
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
|
758
|
+
MODEL_TENSOR.ATTN_SINKS: "blk.{bid}.attn_sinks",
|
|
728
759
|
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
|
729
760
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
|
730
761
|
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
|
@@ -927,6 +958,13 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|
|
927
958
|
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
|
|
928
959
|
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
|
|
929
960
|
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
|
|
961
|
+
# NextN/MTP
|
|
962
|
+
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
|
|
963
|
+
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
|
|
964
|
+
MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm",
|
|
965
|
+
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
|
|
966
|
+
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
|
|
967
|
+
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
|
|
930
968
|
}
|
|
931
969
|
|
|
932
970
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
@@ -1202,6 +1240,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
1202
1240
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
1203
1241
|
MODEL_TENSOR.CLS,
|
|
1204
1242
|
],
|
|
1243
|
+
MODEL_ARCH.JINA_BERT_V3: [
|
|
1244
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
1245
|
+
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
1246
|
+
MODEL_TENSOR.TOKEN_TYPES,
|
|
1247
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
1248
|
+
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
1249
|
+
MODEL_TENSOR.ATTN_QKV,
|
|
1250
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
1251
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
1252
|
+
MODEL_TENSOR.FFN_UP,
|
|
1253
|
+
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
1254
|
+
],
|
|
1205
1255
|
MODEL_ARCH.MPT: [
|
|
1206
1256
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
1207
1257
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -1316,6 +1366,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
1316
1366
|
MODEL_TENSOR.FFN_DOWN,
|
|
1317
1367
|
MODEL_TENSOR.FFN_UP,
|
|
1318
1368
|
],
|
|
1369
|
+
MODEL_ARCH.LLADA: [
|
|
1370
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
1371
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
1372
|
+
MODEL_TENSOR.OUTPUT,
|
|
1373
|
+
MODEL_TENSOR.ROPE_FREQS,
|
|
1374
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
1375
|
+
MODEL_TENSOR.ATTN_Q,
|
|
1376
|
+
MODEL_TENSOR.ATTN_K,
|
|
1377
|
+
MODEL_TENSOR.ATTN_V,
|
|
1378
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
1379
|
+
MODEL_TENSOR.FFN_NORM,
|
|
1380
|
+
MODEL_TENSOR.FFN_GATE,
|
|
1381
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
1382
|
+
MODEL_TENSOR.FFN_UP,
|
|
1383
|
+
],
|
|
1319
1384
|
MODEL_ARCH.QWEN2VL: [
|
|
1320
1385
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
1321
1386
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -1928,6 +1993,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
1928
1993
|
MODEL_TENSOR.FFN_DOWN,
|
|
1929
1994
|
MODEL_TENSOR.FFN_UP,
|
|
1930
1995
|
],
|
|
1996
|
+
MODEL_ARCH.SEED_OSS: [
|
|
1997
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
1998
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
1999
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2000
|
+
MODEL_TENSOR.ATTN_K,
|
|
2001
|
+
MODEL_TENSOR.ATTN_V,
|
|
2002
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2003
|
+
MODEL_TENSOR.ATTN_POST_NORM,
|
|
2004
|
+
MODEL_TENSOR.FFN_GATE,
|
|
2005
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
2006
|
+
MODEL_TENSOR.FFN_UP,
|
|
2007
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2008
|
+
MODEL_TENSOR.OUTPUT,
|
|
2009
|
+
],
|
|
1931
2010
|
MODEL_ARCH.OLMOE: [
|
|
1932
2011
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
1933
2012
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -2100,6 +2179,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2100
2179
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
2101
2180
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
2102
2181
|
],
|
|
2182
|
+
MODEL_ARCH.GLM4_MOE: [
|
|
2183
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2184
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2185
|
+
MODEL_TENSOR.OUTPUT,
|
|
2186
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
2187
|
+
MODEL_TENSOR.ATTN_POST_NORM,
|
|
2188
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2189
|
+
MODEL_TENSOR.ATTN_K,
|
|
2190
|
+
MODEL_TENSOR.ATTN_V,
|
|
2191
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2192
|
+
MODEL_TENSOR.ATTN_Q_NORM,
|
|
2193
|
+
MODEL_TENSOR.ATTN_K_NORM,
|
|
2194
|
+
MODEL_TENSOR.FFN_GATE,
|
|
2195
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
2196
|
+
MODEL_TENSOR.FFN_UP,
|
|
2197
|
+
MODEL_TENSOR.FFN_GATE_INP,
|
|
2198
|
+
MODEL_TENSOR.FFN_GATE_EXP,
|
|
2199
|
+
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
2200
|
+
MODEL_TENSOR.FFN_UP_EXP,
|
|
2201
|
+
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
2202
|
+
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
2203
|
+
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
2204
|
+
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
|
2205
|
+
# NextN/MTP tensors - preserved but unused
|
|
2206
|
+
MODEL_TENSOR.NEXTN_EH_PROJ,
|
|
2207
|
+
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
|
|
2208
|
+
MODEL_TENSOR.NEXTN_ENORM,
|
|
2209
|
+
MODEL_TENSOR.NEXTN_HNORM,
|
|
2210
|
+
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
|
|
2211
|
+
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
|
|
2212
|
+
],
|
|
2103
2213
|
MODEL_ARCH.BITNET: [
|
|
2104
2214
|
MODEL_TENSOR.ATTN_Q,
|
|
2105
2215
|
MODEL_TENSOR.ATTN_K,
|
|
@@ -2189,6 +2299,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2189
2299
|
MODEL_TENSOR.FFN_DOWN,
|
|
2190
2300
|
MODEL_TENSOR.FFN_UP,
|
|
2191
2301
|
],
|
|
2302
|
+
MODEL_ARCH.NEMOTRON_H: [
|
|
2303
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2304
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2305
|
+
MODEL_TENSOR.OUTPUT,
|
|
2306
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
2307
|
+
MODEL_TENSOR.SSM_IN,
|
|
2308
|
+
MODEL_TENSOR.SSM_CONV1D,
|
|
2309
|
+
MODEL_TENSOR.SSM_DT,
|
|
2310
|
+
MODEL_TENSOR.SSM_A,
|
|
2311
|
+
MODEL_TENSOR.SSM_D,
|
|
2312
|
+
MODEL_TENSOR.SSM_NORM,
|
|
2313
|
+
MODEL_TENSOR.SSM_OUT,
|
|
2314
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2315
|
+
MODEL_TENSOR.ATTN_K,
|
|
2316
|
+
MODEL_TENSOR.ATTN_V,
|
|
2317
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2318
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
2319
|
+
MODEL_TENSOR.FFN_UP,
|
|
2320
|
+
],
|
|
2192
2321
|
MODEL_ARCH.EXAONE: [
|
|
2193
2322
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
2194
2323
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -2449,6 +2578,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2449
2578
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
2450
2579
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
2451
2580
|
],
|
|
2581
|
+
MODEL_ARCH.HUNYUAN_DENSE: [
|
|
2582
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2583
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2584
|
+
MODEL_TENSOR.OUTPUT,
|
|
2585
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
2586
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2587
|
+
MODEL_TENSOR.ATTN_Q_NORM,
|
|
2588
|
+
MODEL_TENSOR.ATTN_K,
|
|
2589
|
+
MODEL_TENSOR.ATTN_K_NORM,
|
|
2590
|
+
MODEL_TENSOR.ATTN_V,
|
|
2591
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2592
|
+
MODEL_TENSOR.FFN_NORM,
|
|
2593
|
+
MODEL_TENSOR.FFN_GATE,
|
|
2594
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
2595
|
+
MODEL_TENSOR.FFN_UP,
|
|
2596
|
+
],
|
|
2452
2597
|
MODEL_ARCH.SMOLLM3: [
|
|
2453
2598
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
2454
2599
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
@@ -2465,6 +2610,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2465
2610
|
MODEL_TENSOR.FFN_DOWN,
|
|
2466
2611
|
MODEL_TENSOR.FFN_UP,
|
|
2467
2612
|
],
|
|
2613
|
+
MODEL_ARCH.GPT_OSS: [
|
|
2614
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2615
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2616
|
+
MODEL_TENSOR.OUTPUT,
|
|
2617
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
2618
|
+
MODEL_TENSOR.ATTN_POST_NORM,
|
|
2619
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2620
|
+
MODEL_TENSOR.ATTN_K,
|
|
2621
|
+
MODEL_TENSOR.ATTN_V,
|
|
2622
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2623
|
+
MODEL_TENSOR.ATTN_SINKS,
|
|
2624
|
+
MODEL_TENSOR.FFN_GATE_INP,
|
|
2625
|
+
MODEL_TENSOR.FFN_GATE_EXP,
|
|
2626
|
+
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
2627
|
+
MODEL_TENSOR.FFN_UP_EXP,
|
|
2628
|
+
],
|
|
2468
2629
|
MODEL_ARCH.LFM2: [
|
|
2469
2630
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
2470
2631
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
@@ -2482,6 +2643,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|
|
2482
2643
|
MODEL_TENSOR.ATTN_K,
|
|
2483
2644
|
MODEL_TENSOR.ATTN_V,
|
|
2484
2645
|
MODEL_TENSOR.ATTN_OUT,
|
|
2646
|
+
MODEL_TENSOR.OUTPUT,
|
|
2647
|
+
],
|
|
2648
|
+
MODEL_ARCH.SMALLTHINKER: [
|
|
2649
|
+
MODEL_TENSOR.TOKEN_EMBD,
|
|
2650
|
+
MODEL_TENSOR.OUTPUT_NORM,
|
|
2651
|
+
MODEL_TENSOR.OUTPUT,
|
|
2652
|
+
MODEL_TENSOR.ATTN_NORM,
|
|
2653
|
+
MODEL_TENSOR.ATTN_Q,
|
|
2654
|
+
MODEL_TENSOR.ATTN_K,
|
|
2655
|
+
MODEL_TENSOR.ATTN_V,
|
|
2656
|
+
MODEL_TENSOR.ATTN_OUT,
|
|
2657
|
+
MODEL_TENSOR.FFN_NORM,
|
|
2658
|
+
MODEL_TENSOR.FFN_GATE,
|
|
2659
|
+
MODEL_TENSOR.FFN_DOWN,
|
|
2660
|
+
MODEL_TENSOR.FFN_UP,
|
|
2661
|
+
MODEL_TENSOR.FFN_GATE_INP,
|
|
2662
|
+
MODEL_TENSOR.FFN_GATE_EXP,
|
|
2663
|
+
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
2664
|
+
MODEL_TENSOR.FFN_UP_EXP,
|
|
2485
2665
|
],
|
|
2486
2666
|
# TODO
|
|
2487
2667
|
}
|
|
@@ -2601,6 +2781,7 @@ class GGMLQuantizationType(IntEnum):
|
|
|
2601
2781
|
BF16 = 30
|
|
2602
2782
|
TQ1_0 = 34
|
|
2603
2783
|
TQ2_0 = 35
|
|
2784
|
+
MXFP4 = 39
|
|
2604
2785
|
|
|
2605
2786
|
|
|
2606
2787
|
class ExpertGatingFuncType(IntEnum):
|
|
@@ -2704,6 +2885,9 @@ class VisionProjectorType:
|
|
|
2704
2885
|
INTERNVL = "internvl"
|
|
2705
2886
|
QWEN2A = "qwen2a" # audio
|
|
2706
2887
|
QWEN25O = "qwen2.5o" # omni
|
|
2888
|
+
VOXTRAL = "voxtral"
|
|
2889
|
+
LFM2 = "lfm2"
|
|
2890
|
+
KIMIVL = "kimivl"
|
|
2707
2891
|
|
|
2708
2892
|
|
|
2709
2893
|
# Items here are (block size, type size)
|
|
@@ -2740,6 +2924,7 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
|
|
2740
2924
|
GGMLQuantizationType.BF16: (1, 2),
|
|
2741
2925
|
GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
|
|
2742
2926
|
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
|
|
2927
|
+
GGMLQuantizationType.MXFP4: (32, 1 + 16),
|
|
2743
2928
|
}
|
|
2744
2929
|
|
|
2745
2930
|
|
|
@@ -138,8 +138,9 @@ class GGUFWriter:
|
|
|
138
138
|
size = prod(shape)
|
|
139
139
|
|
|
140
140
|
if "_exps." in name:
|
|
141
|
-
|
|
142
|
-
|
|
141
|
+
expert_count = shape[-2 if ".bias" in name else -3]
|
|
142
|
+
expert_params += (size // expert_count)
|
|
143
|
+
expert_sum += expert_count
|
|
143
144
|
n_expert_tensors += 1
|
|
144
145
|
else:
|
|
145
146
|
shared_params += size
|
|
@@ -753,6 +754,9 @@ class GGUFWriter:
|
|
|
753
754
|
def add_moe_every_n_layers(self, value: int) -> None:
|
|
754
755
|
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
|
|
755
756
|
|
|
757
|
+
def add_nextn_predict_layers(self, count: int) -> None:
|
|
758
|
+
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
|
|
759
|
+
|
|
756
760
|
def add_swin_norm(self, value: bool) -> None:
|
|
757
761
|
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
|
|
758
762
|
|
|
@@ -1047,6 +1051,11 @@ class GGUFWriter:
|
|
|
1047
1051
|
def add_audio_stack_factor(self, value: int) -> None:
|
|
1048
1052
|
self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
|
|
1049
1053
|
|
|
1054
|
+
# diffusion models
|
|
1055
|
+
|
|
1056
|
+
def add_diffusion_shift_logits(self, value: bool) -> None:
|
|
1057
|
+
self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value)
|
|
1058
|
+
|
|
1050
1059
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
|
1051
1060
|
pack_prefix = ''
|
|
1052
1061
|
if not skip_pack_prefix:
|
|
@@ -228,8 +228,7 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
|
|
|
228
228
|
d = max / -8
|
|
229
229
|
with np.errstate(divide="ignore"):
|
|
230
230
|
id = np.where(d == 0, 0, 1 / d)
|
|
231
|
-
|
|
232
|
-
qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
|
|
231
|
+
qs = np.trunc((blocks * id) + np.float32(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
|
|
233
232
|
|
|
234
233
|
qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
|
|
235
234
|
qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
|
|
@@ -300,8 +299,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
|
|
|
300
299
|
d = max / -16
|
|
301
300
|
with np.errstate(divide="ignore"):
|
|
302
301
|
id = np.where(d == 0, 0, 1 / d)
|
|
303
|
-
|
|
304
|
-
q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
|
|
302
|
+
q = np.trunc((blocks * id) + np.float32(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
|
|
305
303
|
|
|
306
304
|
qs = q.reshape((n_blocks, 2, cls.block_size // 2))
|
|
307
305
|
qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
|
|
@@ -655,6 +653,57 @@ class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
|
|
|
655
653
|
return (d * qs.astype(np.float32))
|
|
656
654
|
|
|
657
655
|
|
|
656
|
+
class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
|
|
657
|
+
# e2m1 values (doubled)
|
|
658
|
+
# ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
|
|
659
|
+
kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
|
|
660
|
+
|
|
661
|
+
@staticmethod
|
|
662
|
+
# see ggml_e8m0_to_fp32_half in ggml-impl.h
|
|
663
|
+
def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray:
|
|
664
|
+
bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23))
|
|
665
|
+
return bits.view(np.float32)
|
|
666
|
+
|
|
667
|
+
@classmethod
|
|
668
|
+
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
|
669
|
+
n_blocks = blocks.shape[0]
|
|
670
|
+
|
|
671
|
+
d = abs(blocks).max(axis=-1, keepdims=True)
|
|
672
|
+
|
|
673
|
+
with np.errstate(divide="ignore"):
|
|
674
|
+
e = np.where(d > 0, np.floor(np.log2(d)) - 2 + 127, 0).astype(np.uint8)
|
|
675
|
+
|
|
676
|
+
d = cls.e8m0_to_fp32_half(e)
|
|
677
|
+
|
|
678
|
+
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape((1, 1, 16))
|
|
679
|
+
|
|
680
|
+
errs = np.abs(d.reshape((n_blocks, 1, 1)) * kvalues.astype(np.float32) - blocks.reshape((n_blocks, cls.block_size, 1)))
|
|
681
|
+
best = np.argmin(errs, axis=-1, keepdims=True)
|
|
682
|
+
|
|
683
|
+
qs = best.reshape(n_blocks, 2, cls.block_size // 2).astype(np.uint8)
|
|
684
|
+
qs = qs[:, 0] | (qs[:, 1] << np.uint8(4))
|
|
685
|
+
|
|
686
|
+
qs = qs.reshape((n_blocks, cls.block_size // 2))
|
|
687
|
+
|
|
688
|
+
return np.concatenate([e, qs], axis=-1)
|
|
689
|
+
|
|
690
|
+
@classmethod
|
|
691
|
+
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
|
692
|
+
n_blocks = blocks.shape[0]
|
|
693
|
+
|
|
694
|
+
e, qs = np.hsplit(blocks, [1])
|
|
695
|
+
|
|
696
|
+
d = cls.e8m0_to_fp32_half(e)
|
|
697
|
+
|
|
698
|
+
qs = qs.reshape((n_blocks, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
|
|
699
|
+
qs = (qs & np.uint8(0x0F)).view(np.int8)
|
|
700
|
+
|
|
701
|
+
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
|
|
702
|
+
qs = np.take_along_axis(kvalues, qs, axis=-1).reshape((n_blocks, cls.block_size))
|
|
703
|
+
|
|
704
|
+
return (d * qs.astype(np.float32))
|
|
705
|
+
|
|
706
|
+
|
|
658
707
|
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
|
|
659
708
|
ksigns: bytes = (
|
|
660
709
|
b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
|
|
@@ -19,6 +19,61 @@ import gguf
|
|
|
19
19
|
logger = logging.getLogger("gguf-convert-endian")
|
|
20
20
|
|
|
21
21
|
|
|
22
|
+
def byteswap_q4_0(tensor, block_offs):
|
|
23
|
+
# Each block_q4_0 consists of an f16 delta (scaling factor) followed by 16 int8 quantizations.
|
|
24
|
+
|
|
25
|
+
# Byte-Swap f16 sized delta field
|
|
26
|
+
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
|
27
|
+
delta.byteswap(inplace=True)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def byteswap_q8_0(tensor, block_offs):
|
|
31
|
+
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
|
|
32
|
+
|
|
33
|
+
# Byte-Swap f16 sized delta field
|
|
34
|
+
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
|
35
|
+
delta.byteswap(inplace=True)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def byteswap_q4_k(tensor, block_offs):
|
|
39
|
+
# Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
|
|
40
|
+
|
|
41
|
+
# Byte-Swap f16 sized fields
|
|
42
|
+
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
|
43
|
+
delta.byteswap(inplace=True)
|
|
44
|
+
|
|
45
|
+
delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
|
|
46
|
+
delta.byteswap(inplace=True)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def byteswap_q6_k(tensor, block_offs):
|
|
50
|
+
# Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
|
|
51
|
+
|
|
52
|
+
# Byte-Swap f16 sized field
|
|
53
|
+
delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
|
|
54
|
+
delta.byteswap(inplace=True)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
byteswap_tensors = {
|
|
58
|
+
gguf.GGMLQuantizationType.Q4_0: {
|
|
59
|
+
"block_size": 18, # 18 bytes = <f16 delta scaling factor> + 16 * <int8 quant>
|
|
60
|
+
"byteswap_func": byteswap_q4_0,
|
|
61
|
+
},
|
|
62
|
+
gguf.GGMLQuantizationType.Q8_0: {
|
|
63
|
+
"block_size": 34, # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
|
|
64
|
+
"byteswap_func": byteswap_q8_0,
|
|
65
|
+
},
|
|
66
|
+
gguf.GGMLQuantizationType.Q4_K: {
|
|
67
|
+
"block_size": 144, # 144 bytes = 2 * <f16 delta scaling factor> + 140 * <int8 quant>
|
|
68
|
+
"byteswap_func": byteswap_q4_k,
|
|
69
|
+
},
|
|
70
|
+
gguf.GGMLQuantizationType.Q6_K: {
|
|
71
|
+
"block_size": 210, # 210 bytes = <f16 delta scaling factor> + 208 * <int8 quant>
|
|
72
|
+
"byteswap_func": byteswap_q6_k,
|
|
73
|
+
},
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
|
|
22
77
|
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
|
|
23
78
|
file_endian = reader.endianess.name
|
|
24
79
|
if reader.byte_order == 'S':
|
|
@@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
|
|
|
32
87
|
sys.exit(0)
|
|
33
88
|
logger.info("* Checking tensors for conversion compatibility")
|
|
34
89
|
for tensor in reader.tensors:
|
|
35
|
-
if tensor.tensor_type not in
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
gguf.GGMLQuantizationType.Q6_K,
|
|
41
|
-
):
|
|
90
|
+
if tensor.tensor_type not in byteswap_tensors and \
|
|
91
|
+
tensor.tensor_type not in (
|
|
92
|
+
gguf.GGMLQuantizationType.F32,
|
|
93
|
+
gguf.GGMLQuantizationType.F16,
|
|
94
|
+
):
|
|
42
95
|
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
|
|
43
96
|
logger.info(f"* Preparing to convert from {file_endian} to {order}")
|
|
44
97
|
if args.dry_run:
|
|
@@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
|
|
|
72
125
|
part.byteswap(inplace=True)
|
|
73
126
|
|
|
74
127
|
# Byte-swap tensor data if necessary
|
|
75
|
-
if tensor.tensor_type
|
|
76
|
-
# Handle Q8_0 tensor blocks (block_q8_0)
|
|
77
|
-
# Specific handling of block_q8_0 is required.
|
|
78
|
-
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
|
|
79
|
-
|
|
80
|
-
block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
|
|
81
|
-
|
|
82
|
-
n_blocks = len(tensor.data) // block_size
|
|
83
|
-
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
|
|
84
|
-
block_offs = block_num * block_size
|
|
85
|
-
|
|
86
|
-
# Byte-Swap f16 sized delta field
|
|
87
|
-
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
|
88
|
-
delta.byteswap(inplace=True)
|
|
89
|
-
|
|
90
|
-
# Byte-Swap Q8 weights
|
|
91
|
-
if block_num % 100000 == 0:
|
|
92
|
-
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
|
|
93
|
-
|
|
94
|
-
elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
|
|
95
|
-
# Handle Q4_K tensor blocks (block_q4_k)
|
|
96
|
-
# Specific handling of block_q4_k is required.
|
|
97
|
-
# Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
|
|
98
|
-
|
|
128
|
+
if tensor.tensor_type in byteswap_tensors:
|
|
99
129
|
# first flatten structure
|
|
130
|
+
oldshape = tensor.data.shape
|
|
100
131
|
newshape = 1
|
|
101
132
|
for i in tensor.data.shape:
|
|
102
133
|
newshape *= i
|
|
103
134
|
|
|
104
135
|
tensor.data.resize(newshape)
|
|
105
136
|
|
|
106
|
-
block_size
|
|
107
|
-
|
|
108
|
-
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
|
|
109
|
-
block_offs = block_num * block_size
|
|
110
|
-
|
|
111
|
-
# Byte-Swap f16 sized fields
|
|
112
|
-
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
|
|
113
|
-
delta.byteswap(inplace=True)
|
|
114
|
-
|
|
115
|
-
delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
|
|
116
|
-
delta.byteswap(inplace=True)
|
|
117
|
-
|
|
118
|
-
# Byte-Swap
|
|
119
|
-
if block_num % 100000 == 0:
|
|
120
|
-
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
|
|
121
|
-
|
|
122
|
-
elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
|
|
123
|
-
# Handle Q6_K tensor blocks (block_q6_k)
|
|
124
|
-
# Specific handling of block_q6_k is required.
|
|
125
|
-
# Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
|
|
126
|
-
|
|
127
|
-
# first flatten structure
|
|
128
|
-
newshape = 1
|
|
129
|
-
for i in tensor.data.shape:
|
|
130
|
-
newshape *= i
|
|
131
|
-
|
|
132
|
-
tensor.data.resize(newshape)
|
|
137
|
+
block_size = byteswap_tensors[tensor.tensor_type]["block_size"]
|
|
138
|
+
byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"]
|
|
133
139
|
|
|
134
|
-
block_size = 210
|
|
135
140
|
n_blocks = len(tensor.data) // block_size
|
|
136
141
|
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
|
|
137
142
|
block_offs = block_num * block_size
|
|
138
143
|
|
|
139
|
-
|
|
140
|
-
delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
|
|
141
|
-
delta.byteswap(inplace=True)
|
|
144
|
+
byteswap_func(tensor, block_offs)
|
|
142
145
|
|
|
143
|
-
# Byte-Swap
|
|
144
146
|
if block_num % 100000 == 0:
|
|
145
147
|
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
|
|
146
148
|
|
|
149
|
+
# restore old shape in case it's ever used
|
|
150
|
+
tensor.data.resize(oldshape)
|
|
147
151
|
else:
|
|
148
152
|
# Handle other tensor types
|
|
149
153
|
tensor.data.byteswap(inplace=True)
|
|
@@ -111,6 +111,7 @@ def main() -> None:
|
|
|
111
111
|
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
|
|
112
112
|
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
|
|
113
113
|
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
|
|
114
|
+
parser.add_argument("--chat-template-file", type=Path, help="Jinja file containing chat template", metavar='chat_template.jinja')
|
|
114
115
|
parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
|
|
115
116
|
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
|
|
116
117
|
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
|
|
@@ -134,12 +135,17 @@ def main() -> None:
|
|
|
134
135
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
|
|
135
136
|
|
|
136
137
|
if args.chat_template_config:
|
|
137
|
-
with open(args.chat_template_config, 'r') as fp:
|
|
138
|
+
with open(args.chat_template_config, 'r', encoding='utf-8') as fp:
|
|
138
139
|
config = json.load(fp)
|
|
139
140
|
template = config.get('chat_template')
|
|
140
141
|
if template:
|
|
141
142
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
|
|
142
143
|
|
|
144
|
+
if args.chat_template_file:
|
|
145
|
+
with open(args.chat_template_file, 'r', encoding='utf-8') as fp:
|
|
146
|
+
template = fp.read()
|
|
147
|
+
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
|
|
148
|
+
|
|
143
149
|
if args.pre_tokenizer:
|
|
144
150
|
new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
|
|
145
151
|
|