@novastera-oss/llamarn 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +86 -3
- package/RNLlamaCpp.podspec +1 -1
- package/android/CMakeLists.txt +11 -3
- package/android/generated/jni/react/renderer/components/RNLlamaCppSpec/RNLlamaCppSpecJSI.h +49 -4
- package/android/src/main/cpp/include/llama.h +53 -114
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +2 -10
- package/cpp/PureCppImpl.cpp +71 -4
- package/cpp/SystemUtils.cpp +3 -7
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -1
- package/cpp/llama.cpp/Makefile +6 -1605
- package/cpp/llama.cpp/README.md +5 -1
- package/cpp/llama.cpp/common/arg.cpp +230 -51
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -1
- package/cpp/llama.cpp/common/chat.cpp +539 -8
- package/cpp/llama.cpp/common/chat.h +8 -1
- package/cpp/llama.cpp/common/common.cpp +60 -15
- package/cpp/llama.cpp/common/common.h +64 -15
- package/cpp/llama.cpp/common/speculative.cpp +135 -54
- package/cpp/llama.cpp/common/speculative.h +8 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1216 -109
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +19 -6
- package/cpp/llama.cpp/convert_lora_to_gguf.py +1 -1
- package/cpp/llama.cpp/flake.nix +0 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -3
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +71 -70
- package/cpp/llama.cpp/ggml/include/ggml-opt.h +25 -6
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +16 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +90 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +113 -17
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +701 -585
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +274 -91
- package/cpp/llama.cpp/ggml/src/ggml-common.h +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +132 -596
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +14 -286
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +90 -569
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +162 -589
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +55 -341
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +3 -58
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +371 -298
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +54 -314
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +184 -675
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +4679 -1657
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +21 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +232 -123
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +428 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +4 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +458 -46
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +39 -14
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/traits.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +20 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +122 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +9 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cu +58 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/add-id.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +275 -170
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +103 -65
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv-transpose-1d.cu +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +33 -7
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +2 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +3 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/dequantize.cuh +14 -40
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +83 -27
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +116 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +45 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +56 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +61 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +70 -49
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +70 -21
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +162 -50
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +208 -97
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +46 -35
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +56 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +95 -51
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +427 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +204 -57
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +252 -168
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cu → mmvf.cu} +53 -53
- package/cpp/llama.cpp/ggml/src/ggml-cuda/{mmv.cuh → mmvf.cuh} +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +192 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/opt-step-sgd.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/reduce_rows.cuh +53 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cu +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/roll.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +1 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cu +34 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softcap.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/softmax.cu +16 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +153 -71
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +6 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +21 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmq-instance-mxfp4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +75 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vecdotq.cuh +110 -22
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +14 -25
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +31 -20
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +342 -131
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +464 -134
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1108 -176
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add.cl +107 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/add_id.cl +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +343 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +346 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/glu.cl +41 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul.cl +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +132 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32.cl +189 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32.cl +144 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/norm.cl +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_4_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f16.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/softmax_f32.cl +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +66 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/transpose.cl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +97 -41
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +110 -16
- package/cpp/llama.cpp/ggml/src/ggml-quants.h +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +22 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -212
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +213 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +117 -238
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quantize.hpp +133 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1666 -633
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +41 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +42 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +13 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +39 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +107 -43
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +16 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +44 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +44 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +26 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -17
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +37 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +11 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +109 -55
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +71 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +111 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +49 -11
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +65 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +38 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +55 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/utils.comp +25 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +75 -20
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +807 -412
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +72 -22
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +1794 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/set_rows.wgsl +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/CMakeLists.txt +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +846 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +204 -50
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +187 -2
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +11 -2
- package/cpp/llama.cpp/gguf-py/gguf/quants.py +53 -4
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +67 -63
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +7 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +120 -16
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +284 -1
- package/cpp/llama.cpp/gguf-py/tests/test_quants.py +14 -5
- package/cpp/llama.cpp/include/llama.h +53 -114
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +171 -0
- package/cpp/llama.cpp/models/templates/README.md +2 -1
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +59 -0
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +331 -0
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +105 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -6
- package/cpp/llama.cpp/requirements/requirements-pydantic.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-adapter.cpp +68 -4
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +192 -2
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +2 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +47 -6
- package/cpp/llama.cpp/src/llama-chat.h +3 -0
- package/cpp/llama.cpp/src/llama-context.cpp +61 -252
- package/cpp/llama.cpp/src/llama-context.h +10 -15
- package/cpp/llama.cpp/src/llama-cparams.h +0 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +180 -85
- package/cpp/llama.cpp/src/llama-graph.h +90 -51
- package/cpp/llama.cpp/src/llama-hparams.cpp +34 -3
- package/cpp/llama.cpp/src/llama-hparams.h +21 -6
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.cpp → llama-kv-cache-iswa.cpp} +79 -56
- package/cpp/llama.cpp/src/{llama-kv-cache-unified-iswa.h → llama-kv-cache-iswa.h} +30 -28
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.cpp → llama-kv-cache.cpp} +240 -632
- package/cpp/llama.cpp/src/{llama-kv-cache-unified.h → llama-kv-cache.h} +39 -74
- package/cpp/llama.cpp/src/llama-kv-cells.h +21 -21
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +41 -35
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +26 -29
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +13 -9
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +10 -14
- package/cpp/llama.cpp/src/llama-memory.h +13 -10
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model-loader.h +3 -2
- package/cpp/llama.cpp/src/llama-model.cpp +1959 -419
- package/cpp/llama.cpp/src/llama-model.h +28 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +40 -4
- package/cpp/llama.cpp/src/llama-vocab.cpp +51 -2
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +16 -7
- package/cpp/llama.cpp/vendor/minja/minja.hpp +47 -12
- package/cpp/rn-completion.cpp +3 -27
- package/ios/generated/RNLlamaCppSpec/RNLlamaCppSpec.h +30 -0
- package/ios/generated/RNLlamaCppSpecJSI.h +49 -4
- package/ios/include/chat.h +8 -1
- package/ios/include/common/minja/chat-template.hpp +16 -7
- package/ios/include/common/minja/minja.hpp +47 -12
- package/ios/include/common.h +64 -15
- package/ios/include/llama.h +53 -114
- package/ios/include/speculative.h +8 -1
- package/ios/libs/llama.xcframework/Info.plist +18 -18
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5557 -5267
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5520 -5238
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4242 -4016
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5556 -5267
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5519 -5238
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4241 -4014
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5553 -5303
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5515 -5274
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4238 -4044
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +25 -6
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +90 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +53 -114
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/lib/module/NativeRNLlamaCpp.js.map +1 -1
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts +5 -0
- package/lib/typescript/src/NativeRNLlamaCpp.d.ts.map +1 -1
- package/package.json +1 -2
- package/src/NativeRNLlamaCpp.ts +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +0 -56
package/README.md
CHANGED
|
@@ -18,6 +18,9 @@
|
|
|
18
18
|
* Chat completion with templates (including Jinja template support)
|
|
19
19
|
* Embeddings generation
|
|
20
20
|
* Function/tool calling support
|
|
21
|
+
* **Advanced thinking and reasoning support** for compatible models
|
|
22
|
+
* **Flexible reasoning budget control** (unlimited, disabled, or limited)
|
|
23
|
+
* **Multiple reasoning format support** (none, auto, deepseek, deepseek-legacy)
|
|
21
24
|
|
|
22
25
|
## What Needs Help
|
|
23
26
|
|
|
@@ -38,6 +41,8 @@ We welcome contributions, especially in these areas:
|
|
|
38
41
|
3. **Tool Support**:
|
|
39
42
|
* Improving tool calling functionality for complex interactions
|
|
40
43
|
* Better JSON validation and error handling
|
|
44
|
+
* Enhanced thinking and reasoning model support
|
|
45
|
+
* Advanced reasoning format implementations
|
|
41
46
|
|
|
42
47
|
4. **Testing**:
|
|
43
48
|
* Automated testing using the example project
|
|
@@ -139,7 +144,10 @@ import { initLlama } from '@novastera-oss/llamarn';
|
|
|
139
144
|
const context = await initLlama({
|
|
140
145
|
model: 'path/to/model.gguf',
|
|
141
146
|
n_ctx: 2048,
|
|
142
|
-
n_batch: 512
|
|
147
|
+
n_batch: 512,
|
|
148
|
+
// Optional: Enable thinking and reasoning capabilities
|
|
149
|
+
reasoning_budget: -1, // Unlimited thinking
|
|
150
|
+
reasoning_format: 'auto' // Automatic reasoning format detection
|
|
143
151
|
});
|
|
144
152
|
|
|
145
153
|
// Generate a completion
|
|
@@ -162,7 +170,10 @@ const context = await initLlama({
|
|
|
162
170
|
model: 'path/to/model.gguf',
|
|
163
171
|
n_ctx: 4096,
|
|
164
172
|
n_batch: 512,
|
|
165
|
-
use_jinja: true // Enable Jinja template parsing
|
|
173
|
+
use_jinja: true, // Enable Jinja template parsing
|
|
174
|
+
// Optional: Configure thinking and reasoning
|
|
175
|
+
reasoning_budget: -1, // Enable unlimited thinking
|
|
176
|
+
reasoning_format: 'deepseek' // Use DeepSeek reasoning format
|
|
166
177
|
});
|
|
167
178
|
|
|
168
179
|
// Chat completion with messages
|
|
@@ -189,9 +200,47 @@ const context = await initLlama({
|
|
|
189
200
|
model: 'path/to/model.gguf',
|
|
190
201
|
n_ctx: 2048,
|
|
191
202
|
n_batch: 512,
|
|
192
|
-
use_jinja: true // Enable template handling for tool calls
|
|
203
|
+
use_jinja: true, // Enable template handling for tool calls
|
|
204
|
+
parse_tool_calls: true, // Enable tool call parsing (auto-enabled with use_jinja)
|
|
205
|
+
parallel_tool_calls: false // Disable parallel tool calls for compatibility
|
|
206
|
+
});
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Thinking and Reasoning Models
|
|
210
|
+
|
|
211
|
+
For models that support reasoning and thinking, you can enable advanced thinking functionality:
|
|
212
|
+
|
|
213
|
+
```js
|
|
214
|
+
import { initLlama } from '@novastera-oss/llamarn';
|
|
215
|
+
|
|
216
|
+
// Initialize a reasoning model with thinking capabilities
|
|
217
|
+
const context = await initLlama({
|
|
218
|
+
model: 'path/to/reasoning-model.gguf',
|
|
219
|
+
n_ctx: 4096,
|
|
220
|
+
n_batch: 512,
|
|
221
|
+
use_jinja: true,
|
|
222
|
+
|
|
223
|
+
// Thinking and reasoning options
|
|
224
|
+
reasoning_budget: -1, // -1 = unlimited thinking, 0 = disabled, >0 = limited
|
|
225
|
+
reasoning_format: 'deepseek', // Use DeepSeek reasoning format
|
|
226
|
+
thinking_forced_open: true, // Force the model to always output thinking
|
|
227
|
+
parse_tool_calls: true, // Enable tool call parsing
|
|
228
|
+
parallel_tool_calls: false // Disable parallel tool calls for compatibility
|
|
229
|
+
});
|
|
230
|
+
|
|
231
|
+
// Chat completion with thinking enabled
|
|
232
|
+
const result = await context.completion({
|
|
233
|
+
messages: [
|
|
234
|
+
{ role: 'system', content: 'You are a helpful assistant. Think through problems step by step.' },
|
|
235
|
+
{ role: 'user', content: 'Solve this math problem: What is 15% of 240?' }
|
|
236
|
+
],
|
|
237
|
+
temperature: 0.7
|
|
193
238
|
});
|
|
194
239
|
|
|
240
|
+
console.log('Response:', result.text);
|
|
241
|
+
// The response may include thinking tags like <think>...</think> depending on the model
|
|
242
|
+
```
|
|
243
|
+
|
|
195
244
|
// Create a chat with tool calling
|
|
196
245
|
const response = await context.completion({
|
|
197
246
|
messages: [
|
|
@@ -260,6 +309,40 @@ const embeddingResponse = await context.embedding({
|
|
|
260
309
|
console.log('Embedding:', embeddingResponse.data[0].embedding);
|
|
261
310
|
```
|
|
262
311
|
|
|
312
|
+
## Advanced Configuration Options
|
|
313
|
+
|
|
314
|
+
### Thinking and Reasoning Parameters
|
|
315
|
+
|
|
316
|
+
The library supports advanced thinking and reasoning capabilities for models that support them:
|
|
317
|
+
|
|
318
|
+
- **`reasoning_budget`**: Controls the amount of thinking allowed
|
|
319
|
+
- `-1`: Unlimited thinking (default)
|
|
320
|
+
- `0`: Disabled thinking
|
|
321
|
+
- `>0`: Limited thinking with the specified budget
|
|
322
|
+
|
|
323
|
+
- **`reasoning_format`**: Controls how thinking is parsed and returned
|
|
324
|
+
- `'none'`: Leave thoughts unparsed in message content
|
|
325
|
+
- `'auto'`: Same as deepseek (default)
|
|
326
|
+
- `'deepseek'`: Extract thinking into `message.reasoning_content`
|
|
327
|
+
- `'deepseek-legacy'`: Extract thinking with streaming behavior
|
|
328
|
+
|
|
329
|
+
- **`thinking_forced_open`**: Forces reasoning models to always output thinking
|
|
330
|
+
- `false`: Normal thinking behavior (default)
|
|
331
|
+
- `true`: Always include thinking tags in output
|
|
332
|
+
|
|
333
|
+
- **`parse_tool_calls`**: Enables tool call parsing
|
|
334
|
+
- `true`: Parse and extract tool calls (default)
|
|
335
|
+
- `false`: Disable tool call parsing
|
|
336
|
+
- **Note**: Automatically enabled when `use_jinja` is true
|
|
337
|
+
|
|
338
|
+
- **`parallel_tool_calls`**: Enables multiple tool calls in a single response
|
|
339
|
+
- `false`: Single tool calls only (default, for compatibility)
|
|
340
|
+
- `true`: Allow parallel tool calls (only supported by some models)
|
|
341
|
+
|
|
342
|
+
### Automatic Tool Call Enhancement
|
|
343
|
+
|
|
344
|
+
When `use_jinja` is enabled, `parse_tool_calls` is automatically enabled because Jinja templates provide better tool calling capabilities. This ensures optimal tool support when using advanced templates.
|
|
345
|
+
|
|
263
346
|
## Model Path Handling
|
|
264
347
|
|
|
265
348
|
The module accepts different path formats depending on the platform:
|
package/RNLlamaCpp.podspec
CHANGED
|
@@ -53,7 +53,7 @@ Pod::Spec.new do |s|
|
|
|
53
53
|
# Compiler settings
|
|
54
54
|
s.pod_target_xcconfig = {
|
|
55
55
|
"HEADER_SEARCH_PATHS" => "\"$(PODS_TARGET_SRCROOT)/ios/include\" \"$(PODS_TARGET_SRCROOT)/cpp\" \"$(PODS_TARGET_SRCROOT)/ios/generated/RNLlamaCppSpec\" \"$(PODS_TARGET_SRCROOT)/ios/generated\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/ggml/include\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/common\" \"$(PODS_TARGET_SRCROOT)/cpp/llama.cpp/vendor\" \"$(PODS_ROOT)/boost\" \"$(PODS_ROOT)/Headers/Public/React-bridging\" \"$(PODS_ROOT)/Headers/Public/React\"",
|
|
56
|
-
"OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
|
|
56
|
+
"OTHER_CPLUSPLUSFLAGS" => "-DFOLLY_NO_CONFIG -DFOLLY_MOBILE=1 -DFOLLY_USE_LIBCPP=1 -DFOLLY_CFG_NO_COROUTINES=1 -DLLAMA_METAL -DRCT_NEW_ARCH_ENABLED=1 -DFBJSRT_EXPORTED=1",
|
|
57
57
|
"CLANG_CXX_LANGUAGE_STANDARD" => "c++17",
|
|
58
58
|
"GCC_OPTIMIZATION_LEVEL" => "3", # Maximum optimization
|
|
59
59
|
"SWIFT_OPTIMIZATION_LEVEL" => "-O",
|
package/android/CMakeLists.txt
CHANGED
|
@@ -78,9 +78,17 @@ add_library(
|
|
|
78
78
|
${CPP_DIR}/rn-completion.cpp
|
|
79
79
|
)
|
|
80
80
|
|
|
81
|
-
# Suppress
|
|
82
|
-
target_compile_options(common PRIVATE
|
|
83
|
-
|
|
81
|
+
# Suppress additional warnings that are treated as errors in Expo SDK 54
|
|
82
|
+
target_compile_options(common PRIVATE )
|
|
83
|
+
|
|
84
|
+
# Use React Native's compile options function for proper C++ flags and RN_SERIALIZABLE_STATE
|
|
85
|
+
if(ReactAndroid_VERSION_MINOR GREATER_EQUAL 80)
|
|
86
|
+
# Apply React Native's shared compile options, then suppress unused-function warnings for the RNLlamaCpp target
|
|
87
|
+
target_compile_reactnative_options(RNLlamaCpp PRIVATE)
|
|
88
|
+
target_compile_options(RNLlamaCpp PRIVATE -Wno-unused-function)
|
|
89
|
+
else()
|
|
90
|
+
target_compile_options(RNLlamaCpp PRIVATE -Wno-unused-function)
|
|
91
|
+
endif()
|
|
84
92
|
|
|
85
93
|
# Check if Vulkan backend library is available
|
|
86
94
|
set(VULKAN_BACKEND_AVAILABLE FALSE)
|
|
@@ -18,7 +18,7 @@ namespace facebook::react {
|
|
|
18
18
|
|
|
19
19
|
#pragma mark - NativeRNLlamaCppLlamaModelParams
|
|
20
20
|
|
|
21
|
-
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, typename P10, typename P11, typename P12, typename P13, typename P14, typename P15, typename P16, typename P17, typename P18, typename P19, typename P20, typename P21, typename P22, typename P23>
|
|
21
|
+
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9, typename P10, typename P11, typename P12, typename P13, typename P14, typename P15, typename P16, typename P17, typename P18, typename P19, typename P20, typename P21, typename P22, typename P23, typename P24, typename P25, typename P26, typename P27, typename P28>
|
|
22
22
|
struct NativeRNLlamaCppLlamaModelParams {
|
|
23
23
|
P0 model;
|
|
24
24
|
P1 n_ctx;
|
|
@@ -42,10 +42,15 @@ struct NativeRNLlamaCppLlamaModelParams {
|
|
|
42
42
|
P19 chat_template;
|
|
43
43
|
P20 use_jinja;
|
|
44
44
|
P21 verbose;
|
|
45
|
-
P22
|
|
46
|
-
P23
|
|
45
|
+
P22 reasoning_budget;
|
|
46
|
+
P23 reasoning_format;
|
|
47
|
+
P24 thinking_forced_open;
|
|
48
|
+
P25 parse_tool_calls;
|
|
49
|
+
P26 parallel_tool_calls;
|
|
50
|
+
P27 lora_adapters;
|
|
51
|
+
P28 grammar;
|
|
47
52
|
bool operator==(const NativeRNLlamaCppLlamaModelParams &other) const {
|
|
48
|
-
return model == other.model && n_ctx == other.n_ctx && n_batch == other.n_batch && n_ubatch == other.n_ubatch && n_threads == other.n_threads && n_keep == other.n_keep && n_gpu_layers == other.n_gpu_layers && use_mmap == other.use_mmap && use_mlock == other.use_mlock && vocab_only == other.vocab_only && embedding == other.embedding && seed == other.seed && rope_freq_base == other.rope_freq_base && rope_freq_scale == other.rope_freq_scale && yarn_ext_factor == other.yarn_ext_factor && yarn_attn_factor == other.yarn_attn_factor && yarn_beta_fast == other.yarn_beta_fast && yarn_beta_slow == other.yarn_beta_slow && logits_all == other.logits_all && chat_template == other.chat_template && use_jinja == other.use_jinja && verbose == other.verbose && lora_adapters == other.lora_adapters && grammar == other.grammar;
|
|
53
|
+
return model == other.model && n_ctx == other.n_ctx && n_batch == other.n_batch && n_ubatch == other.n_ubatch && n_threads == other.n_threads && n_keep == other.n_keep && n_gpu_layers == other.n_gpu_layers && use_mmap == other.use_mmap && use_mlock == other.use_mlock && vocab_only == other.vocab_only && embedding == other.embedding && seed == other.seed && rope_freq_base == other.rope_freq_base && rope_freq_scale == other.rope_freq_scale && yarn_ext_factor == other.yarn_ext_factor && yarn_attn_factor == other.yarn_attn_factor && yarn_beta_fast == other.yarn_beta_fast && yarn_beta_slow == other.yarn_beta_slow && logits_all == other.logits_all && chat_template == other.chat_template && use_jinja == other.use_jinja && verbose == other.verbose && reasoning_budget == other.reasoning_budget && reasoning_format == other.reasoning_format && thinking_forced_open == other.thinking_forced_open && parse_tool_calls == other.parse_tool_calls && parallel_tool_calls == other.parallel_tool_calls && lora_adapters == other.lora_adapters && grammar == other.grammar;
|
|
49
54
|
}
|
|
50
55
|
};
|
|
51
56
|
|
|
@@ -80,6 +85,11 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
|
|
|
80
85
|
bridging::fromJs<decltype(types.chat_template)>(rt, value.getProperty(rt, "chat_template"), jsInvoker),
|
|
81
86
|
bridging::fromJs<decltype(types.use_jinja)>(rt, value.getProperty(rt, "use_jinja"), jsInvoker),
|
|
82
87
|
bridging::fromJs<decltype(types.verbose)>(rt, value.getProperty(rt, "verbose"), jsInvoker),
|
|
88
|
+
bridging::fromJs<decltype(types.reasoning_budget)>(rt, value.getProperty(rt, "reasoning_budget"), jsInvoker),
|
|
89
|
+
bridging::fromJs<decltype(types.reasoning_format)>(rt, value.getProperty(rt, "reasoning_format"), jsInvoker),
|
|
90
|
+
bridging::fromJs<decltype(types.thinking_forced_open)>(rt, value.getProperty(rt, "thinking_forced_open"), jsInvoker),
|
|
91
|
+
bridging::fromJs<decltype(types.parse_tool_calls)>(rt, value.getProperty(rt, "parse_tool_calls"), jsInvoker),
|
|
92
|
+
bridging::fromJs<decltype(types.parallel_tool_calls)>(rt, value.getProperty(rt, "parallel_tool_calls"), jsInvoker),
|
|
83
93
|
bridging::fromJs<decltype(types.lora_adapters)>(rt, value.getProperty(rt, "lora_adapters"), jsInvoker),
|
|
84
94
|
bridging::fromJs<decltype(types.grammar)>(rt, value.getProperty(rt, "grammar"), jsInvoker)};
|
|
85
95
|
return result;
|
|
@@ -174,6 +184,26 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
|
|
|
174
184
|
return bridging::toJs(rt, value);
|
|
175
185
|
}
|
|
176
186
|
|
|
187
|
+
static double reasoning_budgetToJs(jsi::Runtime &rt, decltype(types.reasoning_budget) value) {
|
|
188
|
+
return bridging::toJs(rt, value);
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
static jsi::String reasoning_formatToJs(jsi::Runtime &rt, decltype(types.reasoning_format) value) {
|
|
192
|
+
return bridging::toJs(rt, value);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
static bool thinking_forced_openToJs(jsi::Runtime &rt, decltype(types.thinking_forced_open) value) {
|
|
196
|
+
return bridging::toJs(rt, value);
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
static bool parse_tool_callsToJs(jsi::Runtime &rt, decltype(types.parse_tool_calls) value) {
|
|
200
|
+
return bridging::toJs(rt, value);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
static bool parallel_tool_callsToJs(jsi::Runtime &rt, decltype(types.parallel_tool_calls) value) {
|
|
204
|
+
return bridging::toJs(rt, value);
|
|
205
|
+
}
|
|
206
|
+
|
|
177
207
|
static jsi::Array lora_adaptersToJs(jsi::Runtime &rt, decltype(types.lora_adapters) value) {
|
|
178
208
|
return bridging::toJs(rt, value);
|
|
179
209
|
}
|
|
@@ -252,6 +282,21 @@ struct NativeRNLlamaCppLlamaModelParamsBridging {
|
|
|
252
282
|
if (value.verbose) {
|
|
253
283
|
result.setProperty(rt, "verbose", bridging::toJs(rt, value.verbose.value(), jsInvoker));
|
|
254
284
|
}
|
|
285
|
+
if (value.reasoning_budget) {
|
|
286
|
+
result.setProperty(rt, "reasoning_budget", bridging::toJs(rt, value.reasoning_budget.value(), jsInvoker));
|
|
287
|
+
}
|
|
288
|
+
if (value.reasoning_format) {
|
|
289
|
+
result.setProperty(rt, "reasoning_format", bridging::toJs(rt, value.reasoning_format.value(), jsInvoker));
|
|
290
|
+
}
|
|
291
|
+
if (value.thinking_forced_open) {
|
|
292
|
+
result.setProperty(rt, "thinking_forced_open", bridging::toJs(rt, value.thinking_forced_open.value(), jsInvoker));
|
|
293
|
+
}
|
|
294
|
+
if (value.parse_tool_calls) {
|
|
295
|
+
result.setProperty(rt, "parse_tool_calls", bridging::toJs(rt, value.parse_tool_calls.value(), jsInvoker));
|
|
296
|
+
}
|
|
297
|
+
if (value.parallel_tool_calls) {
|
|
298
|
+
result.setProperty(rt, "parallel_tool_calls", bridging::toJs(rt, value.parallel_tool_calls.value(), jsInvoker));
|
|
299
|
+
}
|
|
255
300
|
if (value.lora_adapters) {
|
|
256
301
|
result.setProperty(rt, "lora_adapters", bridging::toJs(rt, value.lora_adapters.value(), jsInvoker));
|
|
257
302
|
}
|
|
@@ -64,8 +64,6 @@ extern "C" {
|
|
|
64
64
|
|
|
65
65
|
typedef struct llama_memory_i * llama_memory_t;
|
|
66
66
|
|
|
67
|
-
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
|
68
|
-
|
|
69
67
|
typedef int32_t llama_pos;
|
|
70
68
|
typedef int32_t llama_token;
|
|
71
69
|
typedef int32_t llama_seq_id;
|
|
@@ -152,6 +150,7 @@ extern "C" {
|
|
|
152
150
|
//LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
|
|
153
151
|
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
|
|
154
152
|
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
|
|
153
|
+
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors
|
|
155
154
|
|
|
156
155
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
|
157
156
|
};
|
|
@@ -284,10 +283,11 @@ extern "C" {
|
|
|
284
283
|
const struct llama_model_kv_override * kv_overrides;
|
|
285
284
|
|
|
286
285
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
|
287
|
-
bool vocab_only;
|
|
288
|
-
bool use_mmap;
|
|
289
|
-
bool use_mlock;
|
|
290
|
-
bool check_tensors;
|
|
286
|
+
bool vocab_only; // only load the vocabulary, no weights
|
|
287
|
+
bool use_mmap; // use mmap if possible
|
|
288
|
+
bool use_mlock; // force system to keep model in RAM
|
|
289
|
+
bool check_tensors; // validate model tensor data
|
|
290
|
+
bool use_extra_bufts; // use extra buffer types (used for weight repacking)
|
|
291
291
|
};
|
|
292
292
|
|
|
293
293
|
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
|
@@ -312,7 +312,7 @@ extern "C" {
|
|
|
312
312
|
float yarn_beta_fast; // YaRN low correction dim
|
|
313
313
|
float yarn_beta_slow; // YaRN high correction dim
|
|
314
314
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
|
315
|
-
float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
|
|
315
|
+
float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default)
|
|
316
316
|
|
|
317
317
|
ggml_backend_sched_eval_callback cb_eval;
|
|
318
318
|
void * cb_eval_user_data;
|
|
@@ -467,8 +467,6 @@ extern "C" {
|
|
|
467
467
|
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
|
468
468
|
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
|
469
469
|
|
|
470
|
-
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
|
|
471
|
-
|
|
472
470
|
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
|
473
471
|
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
|
474
472
|
|
|
@@ -537,6 +535,9 @@ extern "C" {
|
|
|
537
535
|
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
|
|
538
536
|
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
|
|
539
537
|
|
|
538
|
+
// Returns true if the model is diffusion-based (like LLaDA, Dream, etc.)
|
|
539
|
+
LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model);
|
|
540
|
+
|
|
540
541
|
// Returns 0 on success
|
|
541
542
|
LLAMA_API uint32_t llama_model_quantize(
|
|
542
543
|
const char * fname_inp,
|
|
@@ -552,6 +553,24 @@ extern "C" {
|
|
|
552
553
|
struct llama_model * model,
|
|
553
554
|
const char * path_lora);
|
|
554
555
|
|
|
556
|
+
// Functions to access the adapter's GGUF metadata scalar values
|
|
557
|
+
// - The functions return the length of the string on success, or -1 on failure
|
|
558
|
+
// - The output string is always null-terminated and cleared on failure
|
|
559
|
+
// - When retrieving a string, an extra byte must be allocated to account for the null terminator
|
|
560
|
+
// - GGUF array values are not supported by these functions
|
|
561
|
+
|
|
562
|
+
// Get metadata value as a string by key name
|
|
563
|
+
LLAMA_API int32_t llama_adapter_meta_val_str(const struct llama_adapter_lora * adapter, const char * key, char * buf, size_t buf_size);
|
|
564
|
+
|
|
565
|
+
// Get the number of metadata key/value pairs
|
|
566
|
+
LLAMA_API int32_t llama_adapter_meta_count(const struct llama_adapter_lora * adapter);
|
|
567
|
+
|
|
568
|
+
// Get metadata key name by index
|
|
569
|
+
LLAMA_API int32_t llama_adapter_meta_key_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
|
570
|
+
|
|
571
|
+
// Get metadata value as a string by index
|
|
572
|
+
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
|
573
|
+
|
|
555
574
|
// Manually free a LoRA adapter
|
|
556
575
|
// Note: loaded adapters will be freed when the associated model is deleted
|
|
557
576
|
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
|
@@ -662,111 +681,6 @@ extern "C" {
|
|
|
662
681
|
// Check if the memory supports shifting
|
|
663
682
|
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
|
|
664
683
|
|
|
665
|
-
//
|
|
666
|
-
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
|
|
667
|
-
//
|
|
668
|
-
|
|
669
|
-
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
|
670
|
-
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
|
671
|
-
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
|
|
672
|
-
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
673
|
-
|
|
674
|
-
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
|
675
|
-
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
|
|
676
|
-
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
|
677
|
-
|
|
678
|
-
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
|
679
|
-
DEPRECATED(LLAMA_API void llama_kv_self_clear(
|
|
680
|
-
struct llama_context * ctx),
|
|
681
|
-
"Use llama_memory_clear() instead");
|
|
682
|
-
|
|
683
|
-
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
684
|
-
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
|
685
|
-
// seq_id < 0 : match any sequence
|
|
686
|
-
// p0 < 0 : [0, p1]
|
|
687
|
-
// p1 < 0 : [p0, inf)
|
|
688
|
-
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
|
|
689
|
-
struct llama_context * ctx,
|
|
690
|
-
llama_seq_id seq_id,
|
|
691
|
-
llama_pos p0,
|
|
692
|
-
llama_pos p1),
|
|
693
|
-
"Use llama_memory_seq_rm() instead");
|
|
694
|
-
|
|
695
|
-
// Copy all tokens that belong to the specified sequence to another sequence
|
|
696
|
-
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
|
697
|
-
// p0 < 0 : [0, p1]
|
|
698
|
-
// p1 < 0 : [p0, inf)
|
|
699
|
-
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
|
|
700
|
-
struct llama_context * ctx,
|
|
701
|
-
llama_seq_id seq_id_src,
|
|
702
|
-
llama_seq_id seq_id_dst,
|
|
703
|
-
llama_pos p0,
|
|
704
|
-
llama_pos p1),
|
|
705
|
-
"Use llama_memory_seq_cp() instead");
|
|
706
|
-
|
|
707
|
-
// Removes all tokens that do not belong to the specified sequence
|
|
708
|
-
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
|
|
709
|
-
struct llama_context * ctx,
|
|
710
|
-
llama_seq_id seq_id),
|
|
711
|
-
"Use llama_memory_seq_keep() instead");
|
|
712
|
-
|
|
713
|
-
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
|
714
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
715
|
-
// - lazily on next llama_decode()
|
|
716
|
-
// p0 < 0 : [0, p1]
|
|
717
|
-
// p1 < 0 : [p0, inf)
|
|
718
|
-
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
|
|
719
|
-
struct llama_context * ctx,
|
|
720
|
-
llama_seq_id seq_id,
|
|
721
|
-
llama_pos p0,
|
|
722
|
-
llama_pos p1,
|
|
723
|
-
llama_pos delta),
|
|
724
|
-
"Use llama_memory_seq_add() instead");
|
|
725
|
-
|
|
726
|
-
// Integer division of the positions by factor of `d > 1`
|
|
727
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
|
728
|
-
// - lazily on next llama_decode()
|
|
729
|
-
// p0 < 0 : [0, p1]
|
|
730
|
-
// p1 < 0 : [p0, inf)
|
|
731
|
-
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
|
|
732
|
-
struct llama_context * ctx,
|
|
733
|
-
llama_seq_id seq_id,
|
|
734
|
-
llama_pos p0,
|
|
735
|
-
llama_pos p1,
|
|
736
|
-
int d),
|
|
737
|
-
"Use llama_memory_seq_div() instead");
|
|
738
|
-
|
|
739
|
-
// Returns the smallest position present in the KV cache for the specified sequence
|
|
740
|
-
// This is typically non-zero only for SWA caches
|
|
741
|
-
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
742
|
-
// Return -1 if the sequence is empty
|
|
743
|
-
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
|
744
|
-
struct llama_context * ctx,
|
|
745
|
-
llama_seq_id seq_id),
|
|
746
|
-
"Use llama_memory_seq_pos_min() instead");
|
|
747
|
-
|
|
748
|
-
// Returns the largest position present in the KV cache for the specified sequence
|
|
749
|
-
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
|
750
|
-
// Return -1 if the sequence is empty
|
|
751
|
-
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
|
752
|
-
struct llama_context * ctx,
|
|
753
|
-
llama_seq_id seq_id),
|
|
754
|
-
"Use llama_memory_seq_pos_max() instead");
|
|
755
|
-
|
|
756
|
-
// Defragment the KV cache
|
|
757
|
-
// This will be applied:
|
|
758
|
-
// - lazily on next llama_decode()
|
|
759
|
-
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
|
|
760
|
-
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
|
761
|
-
|
|
762
|
-
// Check if the context supports KV cache shifting
|
|
763
|
-
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
|
|
764
|
-
"use llama_memory_can_shift() instead");
|
|
765
|
-
|
|
766
|
-
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
|
767
|
-
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
|
|
768
|
-
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
|
769
|
-
|
|
770
684
|
//
|
|
771
685
|
// State / sessions
|
|
772
686
|
//
|
|
@@ -865,6 +779,29 @@ extern "C" {
|
|
|
865
779
|
size_t n_token_capacity,
|
|
866
780
|
size_t * n_token_count_out);
|
|
867
781
|
|
|
782
|
+
#define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
|
|
783
|
+
|
|
784
|
+
typedef uint32_t llama_state_seq_flags;
|
|
785
|
+
|
|
786
|
+
LLAMA_API size_t llama_state_seq_get_size_ext(
|
|
787
|
+
struct llama_context * ctx,
|
|
788
|
+
llama_seq_id seq_id,
|
|
789
|
+
llama_state_seq_flags flags);
|
|
790
|
+
|
|
791
|
+
LLAMA_API size_t llama_state_seq_get_data_ext(
|
|
792
|
+
struct llama_context * ctx,
|
|
793
|
+
uint8_t * dst,
|
|
794
|
+
size_t size,
|
|
795
|
+
llama_seq_id seq_id,
|
|
796
|
+
llama_state_seq_flags flags);
|
|
797
|
+
|
|
798
|
+
LLAMA_API size_t llama_state_seq_set_data_ext(
|
|
799
|
+
struct llama_context * ctx,
|
|
800
|
+
const uint8_t * src,
|
|
801
|
+
size_t size,
|
|
802
|
+
llama_seq_id dest_seq_id,
|
|
803
|
+
llama_state_seq_flags flags);
|
|
804
|
+
|
|
868
805
|
//
|
|
869
806
|
// Decoding
|
|
870
807
|
//
|
|
@@ -1432,6 +1369,8 @@ extern "C" {
|
|
|
1432
1369
|
|
|
1433
1370
|
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
|
1434
1371
|
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
|
1372
|
+
|
|
1373
|
+
enum ggml_opt_optimizer_type optimizer_type;
|
|
1435
1374
|
};
|
|
1436
1375
|
|
|
1437
1376
|
LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/LlamaCppModel.cpp
CHANGED
|
@@ -948,16 +948,8 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
|
|
|
948
948
|
throw std::runtime_error("Invalid embedding dimension");
|
|
949
949
|
}
|
|
950
950
|
|
|
951
|
-
//
|
|
952
|
-
|
|
953
|
-
if (options.hasProperty(rt, "pooling") && options.getProperty(rt, "pooling").isString()) {
|
|
954
|
-
std::string pooling = options.getProperty(rt, "pooling").getString(rt).utf8(rt);
|
|
955
|
-
if (pooling == "last") {
|
|
956
|
-
pooling_type = LLAMA_POOLING_TYPE_LAST;
|
|
957
|
-
} else if (pooling == "cls" || pooling == "first") {
|
|
958
|
-
pooling_type = LLAMA_POOLING_TYPE_CLS;
|
|
959
|
-
}
|
|
960
|
-
}
|
|
951
|
+
// Note: Pooling is handled automatically by llama_get_embeddings()
|
|
952
|
+
// The function returns the appropriate embedding based on the model's configuration
|
|
961
953
|
|
|
962
954
|
// Get the embeddings
|
|
963
955
|
std::vector<float> embedding_vec(n_embd);
|