cui-llama.rn 1.7.3 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +94 -8
- package/android/src/main/java/com/rnllama/RNLlama.java +247 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/llama.h
CHANGED
@@ -62,7 +62,10 @@ extern "C" {
|
|
62
62
|
struct llama_model;
|
63
63
|
struct llama_context;
|
64
64
|
struct llama_sampler;
|
65
|
-
|
65
|
+
|
66
|
+
typedef struct llama_memory_i * llama_memory_t;
|
67
|
+
|
68
|
+
struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
|
66
69
|
|
67
70
|
typedef int32_t llama_pos;
|
68
71
|
typedef int32_t llama_token;
|
@@ -241,18 +244,21 @@ extern "C" {
|
|
241
244
|
|
242
245
|
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
243
246
|
|
244
|
-
// Input data for llama_decode
|
247
|
+
// Input data for llama_encode/llama_decode
|
245
248
|
// A llama_batch object can contain input about one or many sequences
|
246
249
|
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
247
250
|
//
|
248
251
|
// - token : the token ids of the input (used when embd is NULL)
|
249
252
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
250
253
|
// - pos : the positions of the respective token in the sequence
|
251
|
-
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
254
|
+
// (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
|
252
255
|
// - seq_id : the sequence to which the respective token belongs
|
253
256
|
// (if set to NULL, the sequence ID will be assumed to be 0)
|
254
257
|
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
255
|
-
// (if set to NULL
|
258
|
+
// (if set to NULL:
|
259
|
+
// - if embeddings: all tokens are output
|
260
|
+
// - if not: only the last token is output
|
261
|
+
// )
|
256
262
|
//
|
257
263
|
typedef struct llama_batch {
|
258
264
|
int32_t n_tokens;
|
@@ -262,7 +268,7 @@ extern "C" {
|
|
262
268
|
llama_pos * pos;
|
263
269
|
int32_t * n_seq_id;
|
264
270
|
llama_seq_id ** seq_id;
|
265
|
-
int8_t * logits;
|
271
|
+
int8_t * logits; // TODO: rename this to "output"
|
266
272
|
} llama_batch;
|
267
273
|
|
268
274
|
enum llama_model_kv_override_type {
|
@@ -367,6 +373,8 @@ extern "C" {
|
|
367
373
|
bool no_perf; // measure performance timings
|
368
374
|
bool op_offload; // offload host tensor operations to device
|
369
375
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
376
|
+
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
377
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
370
378
|
};
|
371
379
|
|
372
380
|
// model quantization parameters
|
@@ -383,6 +391,7 @@ extern "C" {
|
|
383
391
|
void * imatrix; // pointer to importance matrix data
|
384
392
|
void * kv_overrides; // pointer to vector containing overrides
|
385
393
|
void * tensor_types; // pointer to vector containing tensor types
|
394
|
+
void * prune_layers; // pointer to vector containing layer indices to prune
|
386
395
|
} llama_model_quantize_params;
|
387
396
|
|
388
397
|
typedef struct llama_logit_bias {
|
@@ -472,6 +481,7 @@ extern "C" {
|
|
472
481
|
LLAMA_API int64_t llama_time_us(void);
|
473
482
|
|
474
483
|
LLAMA_API size_t llama_max_devices(void);
|
484
|
+
LLAMA_API size_t llama_max_parallel_sequences(void);
|
475
485
|
|
476
486
|
LLAMA_API bool llama_supports_mmap (void);
|
477
487
|
LLAMA_API bool llama_supports_mlock (void);
|
@@ -491,9 +501,11 @@ extern "C" {
|
|
491
501
|
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
|
492
502
|
|
493
503
|
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
|
494
|
-
LLAMA_API
|
504
|
+
LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
|
495
505
|
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
|
496
506
|
|
507
|
+
DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
|
508
|
+
|
497
509
|
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
|
498
510
|
LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
|
499
511
|
|
@@ -502,10 +514,18 @@ extern "C" {
|
|
502
514
|
LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model);
|
503
515
|
LLAMA_API int32_t llama_model_n_head (const struct llama_model * model);
|
504
516
|
LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model);
|
517
|
+
LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model);
|
505
518
|
|
506
519
|
// Get the model's RoPE frequency scaling factor
|
507
520
|
LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
|
508
521
|
|
522
|
+
// Returns the number of classifier outputs (only valid for classifier models)
|
523
|
+
// Undefined behavior for non-classifier models
|
524
|
+
LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
|
525
|
+
|
526
|
+
// Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
|
527
|
+
LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
|
528
|
+
|
509
529
|
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
|
510
530
|
|
511
531
|
LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
|
@@ -606,106 +626,190 @@ extern "C" {
|
|
606
626
|
int32_t il_end);
|
607
627
|
|
608
628
|
//
|
609
|
-
//
|
629
|
+
// Memory
|
630
|
+
//
|
631
|
+
|
632
|
+
// Clear the memory contents
|
633
|
+
// If data == true, the data buffers will also be cleared together with the metadata
|
634
|
+
LLAMA_API void llama_memory_clear(
|
635
|
+
llama_memory_t mem,
|
636
|
+
bool data);
|
637
|
+
|
638
|
+
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
639
|
+
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
640
|
+
// seq_id < 0 : match any sequence
|
641
|
+
// p0 < 0 : [0, p1]
|
642
|
+
// p1 < 0 : [p0, inf)
|
643
|
+
LLAMA_API bool llama_memory_seq_rm(
|
644
|
+
llama_memory_t mem,
|
645
|
+
llama_seq_id seq_id,
|
646
|
+
llama_pos p0,
|
647
|
+
llama_pos p1);
|
648
|
+
|
649
|
+
// Copy all tokens that belong to the specified sequence to another sequence
|
650
|
+
// p0 < 0 : [0, p1]
|
651
|
+
// p1 < 0 : [p0, inf)
|
652
|
+
LLAMA_API void llama_memory_seq_cp(
|
653
|
+
llama_memory_t mem,
|
654
|
+
llama_seq_id seq_id_src,
|
655
|
+
llama_seq_id seq_id_dst,
|
656
|
+
llama_pos p0,
|
657
|
+
llama_pos p1);
|
658
|
+
|
659
|
+
// Removes all tokens that do not belong to the specified sequence
|
660
|
+
LLAMA_API void llama_memory_seq_keep(
|
661
|
+
llama_memory_t mem,
|
662
|
+
llama_seq_id seq_id);
|
663
|
+
|
664
|
+
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
665
|
+
// p0 < 0 : [0, p1]
|
666
|
+
// p1 < 0 : [p0, inf)
|
667
|
+
LLAMA_API void llama_memory_seq_add(
|
668
|
+
llama_memory_t mem,
|
669
|
+
llama_seq_id seq_id,
|
670
|
+
llama_pos p0,
|
671
|
+
llama_pos p1,
|
672
|
+
llama_pos delta);
|
673
|
+
|
674
|
+
// Integer division of the positions by factor of `d > 1`
|
675
|
+
// p0 < 0 : [0, p1]
|
676
|
+
// p1 < 0 : [p0, inf)
|
677
|
+
LLAMA_API void llama_memory_seq_div(
|
678
|
+
llama_memory_t mem,
|
679
|
+
llama_seq_id seq_id,
|
680
|
+
llama_pos p0,
|
681
|
+
llama_pos p1,
|
682
|
+
int d);
|
683
|
+
|
684
|
+
// Returns the smallest position present in the memory for the specified sequence
|
685
|
+
// This is typically non-zero only for SWA caches
|
686
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
687
|
+
// Return -1 if the sequence is empty
|
688
|
+
LLAMA_API llama_pos llama_memory_seq_pos_min(
|
689
|
+
llama_memory_t mem,
|
690
|
+
llama_seq_id seq_id);
|
691
|
+
|
692
|
+
// Returns the largest position present in the memory for the specified sequence
|
693
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
|
694
|
+
// Return -1 if the sequence is empty
|
695
|
+
LLAMA_API llama_pos llama_memory_seq_pos_max(
|
696
|
+
llama_memory_t mem,
|
697
|
+
llama_seq_id seq_id);
|
698
|
+
|
699
|
+
// Check if the memory supports shifting
|
700
|
+
LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
|
701
|
+
|
702
|
+
//
|
703
|
+
// KV cache for self-attention (TODO: deprecate in favor of llama_memory)
|
610
704
|
//
|
611
705
|
|
612
706
|
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
613
707
|
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
614
708
|
DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx),
|
615
|
-
"Use llama_kv_self_seq_pos_max() instead");
|
709
|
+
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
616
710
|
|
617
711
|
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
618
712
|
DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx),
|
619
|
-
"Use llama_kv_self_seq_pos_max() instead");
|
713
|
+
"Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
|
620
714
|
|
621
715
|
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
622
|
-
LLAMA_API void llama_kv_self_clear(
|
623
|
-
|
716
|
+
DEPRECATED(LLAMA_API void llama_kv_self_clear(
|
717
|
+
struct llama_context * ctx),
|
718
|
+
"Use llama_memory_clear() instead");
|
624
719
|
|
625
720
|
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
626
721
|
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
627
722
|
// seq_id < 0 : match any sequence
|
628
723
|
// p0 < 0 : [0, p1]
|
629
724
|
// p1 < 0 : [p0, inf)
|
630
|
-
LLAMA_API bool llama_kv_self_seq_rm(
|
725
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
|
631
726
|
struct llama_context * ctx,
|
632
727
|
llama_seq_id seq_id,
|
633
728
|
llama_pos p0,
|
634
|
-
llama_pos p1)
|
729
|
+
llama_pos p1),
|
730
|
+
"Use llama_memory_seq_rm() instead");
|
635
731
|
|
636
732
|
// Copy all tokens that belong to the specified sequence to another sequence
|
637
733
|
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
638
734
|
// p0 < 0 : [0, p1]
|
639
735
|
// p1 < 0 : [p0, inf)
|
640
|
-
LLAMA_API void llama_kv_self_seq_cp(
|
736
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
|
641
737
|
struct llama_context * ctx,
|
642
738
|
llama_seq_id seq_id_src,
|
643
739
|
llama_seq_id seq_id_dst,
|
644
740
|
llama_pos p0,
|
645
|
-
llama_pos p1)
|
741
|
+
llama_pos p1),
|
742
|
+
"Use llama_memory_seq_cp() instead");
|
646
743
|
|
647
744
|
// Removes all tokens that do not belong to the specified sequence
|
648
|
-
LLAMA_API void llama_kv_self_seq_keep(
|
745
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
|
649
746
|
struct llama_context * ctx,
|
650
|
-
llama_seq_id seq_id)
|
747
|
+
llama_seq_id seq_id),
|
748
|
+
"Use llama_memory_seq_keep() instead");
|
651
749
|
|
652
750
|
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
653
751
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
654
752
|
// - lazily on next llama_decode()
|
655
|
-
// - explicitly with llama_kv_self_update()
|
656
753
|
// p0 < 0 : [0, p1]
|
657
754
|
// p1 < 0 : [p0, inf)
|
658
|
-
LLAMA_API void llama_kv_self_seq_add(
|
755
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
|
659
756
|
struct llama_context * ctx,
|
660
757
|
llama_seq_id seq_id,
|
661
758
|
llama_pos p0,
|
662
759
|
llama_pos p1,
|
663
|
-
llama_pos delta)
|
760
|
+
llama_pos delta),
|
761
|
+
"Use llama_memory_seq_add() instead");
|
664
762
|
|
665
763
|
// Integer division of the positions by factor of `d > 1`
|
666
764
|
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
667
765
|
// - lazily on next llama_decode()
|
668
|
-
// - explicitly with llama_kv_self_update()
|
669
766
|
// p0 < 0 : [0, p1]
|
670
767
|
// p1 < 0 : [p0, inf)
|
671
|
-
|
768
|
+
DEPRECATED(void llama_kv_self_seq_div(
|
672
769
|
struct llama_context * ctx,
|
673
770
|
llama_seq_id seq_id,
|
674
771
|
llama_pos p0,
|
675
772
|
llama_pos p1,
|
676
|
-
int d)
|
773
|
+
int d),
|
774
|
+
"Use llama_memory_seq_div() instead");
|
677
775
|
|
678
776
|
// Returns the smallest position present in the KV cache for the specified sequence
|
679
777
|
// This is typically non-zero only for SWA caches
|
778
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
680
779
|
// Return -1 if the sequence is empty
|
681
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
780
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
|
682
781
|
struct llama_context * ctx,
|
683
|
-
llama_seq_id seq_id)
|
782
|
+
llama_seq_id seq_id),
|
783
|
+
"Use llama_memory_seq_pos_min() instead");
|
684
784
|
|
685
785
|
// Returns the largest position present in the KV cache for the specified sequence
|
786
|
+
// Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
|
686
787
|
// Return -1 if the sequence is empty
|
687
|
-
LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
788
|
+
DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
|
688
789
|
struct llama_context * ctx,
|
689
|
-
llama_seq_id seq_id)
|
790
|
+
llama_seq_id seq_id),
|
791
|
+
"Use llama_memory_seq_pos_max() instead");
|
690
792
|
|
691
793
|
// Defragment the KV cache
|
692
794
|
// This will be applied:
|
693
795
|
// - lazily on next llama_decode()
|
694
|
-
|
695
|
-
|
796
|
+
DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
|
797
|
+
"simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
|
696
798
|
|
697
799
|
// Check if the context supports KV cache shifting
|
698
|
-
LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx)
|
800
|
+
DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
|
801
|
+
"use llama_memory_can_shift() instead");
|
699
802
|
|
700
803
|
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
701
|
-
LLAMA_API void llama_kv_self_update(struct llama_context * ctx)
|
804
|
+
DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
|
805
|
+
"simply remove this call, updates are applied lazily on the next llama_decode()");
|
702
806
|
|
703
807
|
//
|
704
808
|
// State / sessions
|
705
809
|
//
|
706
810
|
|
707
811
|
// Returns the *actual* size in bytes of the state
|
708
|
-
// (logits, embedding and
|
812
|
+
// (logits, embedding and memory)
|
709
813
|
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
|
710
814
|
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
|
711
815
|
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
|
@@ -761,12 +865,12 @@ extern "C" {
|
|
761
865
|
size_t n_token_count),
|
762
866
|
"use llama_state_save_file instead");
|
763
867
|
|
764
|
-
// Get the exact size needed to copy the
|
868
|
+
// Get the exact size needed to copy the state of a single sequence
|
765
869
|
LLAMA_API size_t llama_state_seq_get_size(
|
766
870
|
struct llama_context * ctx,
|
767
871
|
llama_seq_id seq_id);
|
768
872
|
|
769
|
-
// Copy the
|
873
|
+
// Copy the state of a single sequence into the specified buffer
|
770
874
|
LLAMA_API size_t llama_state_seq_get_data(
|
771
875
|
struct llama_context * ctx,
|
772
876
|
uint8_t * dst,
|
@@ -832,21 +936,23 @@ extern "C" {
|
|
832
936
|
// For encode-decoder contexts, processes the batch using the encoder.
|
833
937
|
// Can store the encoder output internally for later use by the decoder's cross-attention layers.
|
834
938
|
// 0 - success
|
835
|
-
// < 0 - error. the
|
939
|
+
// < 0 - error. the memory state is restored to the state before this call
|
836
940
|
LLAMA_API int32_t llama_encode(
|
837
941
|
struct llama_context * ctx,
|
838
942
|
struct llama_batch batch);
|
839
943
|
|
840
944
|
// Process a batch of tokens.
|
841
|
-
// Requires
|
945
|
+
// Requires the context to have a memory.
|
842
946
|
// For encode-decoder contexts, processes the batch using the decoder.
|
843
947
|
// Positive return values does not mean a fatal error, but rather a warning.
|
844
|
-
// Upon
|
948
|
+
// Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
|
949
|
+
// To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
|
950
|
+
// Upon other return values, the memory state is restored to the state before this call
|
845
951
|
// 0 - success
|
846
952
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
847
|
-
// 2 - aborted
|
953
|
+
// 2 - aborted (processed ubatches will remain in the context's memory)
|
848
954
|
// -1 - invalid input batch
|
849
|
-
// < -1 - error
|
955
|
+
// < -1 - fatal error (processed ubatches will remain in the context's memory)
|
850
956
|
LLAMA_API int32_t llama_decode(
|
851
957
|
struct llama_context * ctx,
|
852
958
|
struct llama_batch batch);
|
@@ -862,8 +968,8 @@ extern "C" {
|
|
862
968
|
// Get the number of threads used for prompt and batch processing (multiple token).
|
863
969
|
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
|
864
970
|
|
865
|
-
// Set whether the
|
866
|
-
//
|
971
|
+
// Set whether the context outputs embeddings or not
|
972
|
+
// TODO: rename to avoid confusion with llama_get_embeddings()
|
867
973
|
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
868
974
|
|
869
975
|
// Set whether to use causal attention or not
|
@@ -912,7 +1018,7 @@ extern "C" {
|
|
912
1018
|
|
913
1019
|
// Get the embeddings for a sequence id
|
914
1020
|
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
915
|
-
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[
|
1021
|
+
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
|
916
1022
|
// otherwise: float[n_embd] (1-dimensional)
|
917
1023
|
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
918
1024
|
|
@@ -942,6 +1048,7 @@ extern "C" {
|
|
942
1048
|
|
943
1049
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
944
1050
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
1051
|
+
LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
|
945
1052
|
|
946
1053
|
LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
|
947
1054
|
LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
|
@@ -985,6 +1092,7 @@ extern "C" {
|
|
985
1092
|
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
986
1093
|
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
987
1094
|
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
1095
|
+
/// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
|
988
1096
|
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
|
989
1097
|
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
990
1098
|
/// as plaintext. Does not insert a leading space.
|