cui-llama.rn 1.7.4 → 1.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -17
- package/android/src/main/CMakeLists.txt +34 -15
- package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
- package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
- package/android/src/main/jni.cpp +213 -14
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
- package/cpp/README.md +1 -1
- package/cpp/chat-parser.cpp +385 -0
- package/cpp/chat-parser.h +120 -0
- package/cpp/chat.cpp +726 -596
- package/cpp/chat.h +71 -6
- package/cpp/common.cpp +56 -38
- package/cpp/common.h +9 -3
- package/cpp/ggml-backend-reg.cpp +5 -0
- package/cpp/ggml-backend.cpp +10 -2
- package/cpp/ggml-common.h +4 -0
- package/cpp/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
- package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/ggml-cpu/common.h +4 -3
- package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
- package/cpp/ggml-cpu/ggml-cpu.c +123 -104
- package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
- package/cpp/ggml-cpu/ops.cpp +330 -148
- package/cpp/ggml-cpu/ops.h +1 -0
- package/cpp/ggml-cpu/quants.c +1158 -0
- package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/ggml-cpu/repack.cpp +1571 -0
- package/cpp/ggml-cpu/repack.h +98 -0
- package/cpp/ggml-cpu/simd-mappings.h +330 -38
- package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/ggml-cpu/vec.cpp +87 -18
- package/cpp/ggml-cpu/vec.h +249 -94
- package/cpp/ggml-cpu.h +1 -0
- package/cpp/ggml-impl.h +63 -183
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal.m +152 -45
- package/cpp/ggml-quants.c +0 -2
- package/cpp/ggml.c +61 -21
- package/cpp/ggml.h +22 -3
- package/cpp/gguf.cpp +24 -3
- package/cpp/json-partial.cpp +256 -0
- package/cpp/json-partial.h +38 -0
- package/cpp/json-schema-to-grammar.cpp +5 -47
- package/cpp/json-schema-to-grammar.h +4 -4
- package/cpp/llama-arch.cpp +153 -3
- package/cpp/llama-arch.h +27 -1
- package/cpp/llama-batch.cpp +741 -272
- package/cpp/llama-batch.h +112 -54
- package/cpp/llama-chat.cpp +30 -8
- package/cpp/llama-chat.h +1 -0
- package/cpp/llama-context.cpp +524 -339
- package/cpp/llama-context.h +38 -17
- package/cpp/llama-cparams.cpp +4 -0
- package/cpp/llama-cparams.h +2 -0
- package/cpp/llama-grammar.cpp +12 -2
- package/cpp/llama-graph.cpp +431 -356
- package/cpp/llama-graph.h +126 -58
- package/cpp/llama-hparams.cpp +10 -2
- package/cpp/llama-hparams.h +19 -2
- package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
- package/cpp/llama-kv-cache-unified-iswa.h +128 -0
- package/cpp/llama-kv-cache-unified.cpp +1841 -0
- package/cpp/llama-kv-cache-unified.h +303 -0
- package/cpp/llama-kv-cells.h +439 -0
- package/cpp/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama-memory-hybrid.h +138 -0
- package/cpp/llama-memory-recurrent.cpp +1112 -0
- package/cpp/llama-memory-recurrent.h +183 -0
- package/cpp/llama-memory.cpp +41 -0
- package/cpp/llama-memory.h +86 -5
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +42 -17
- package/cpp/llama-model-saver.cpp +1 -0
- package/cpp/llama-model.cpp +1639 -513
- package/cpp/llama-model.h +26 -0
- package/cpp/llama-sampling.cpp +2 -2
- package/cpp/llama-vocab.cpp +65 -28
- package/cpp/llama-vocab.h +1 -0
- package/cpp/llama.cpp +11 -7
- package/cpp/llama.h +150 -42
- package/cpp/minja/chat-template.hpp +1 -1
- package/cpp/minja/minja.hpp +1 -1
- package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/cpp/nlohmann/json_fwd.hpp +187 -0
- package/cpp/regex-partial.cpp +204 -0
- package/cpp/regex-partial.h +56 -0
- package/cpp/rn-llama.cpp +646 -35
- package/cpp/rn-llama.h +32 -1
- package/cpp/rn-tts.h +39 -0
- package/cpp/sampling.cpp +7 -8
- package/cpp/tools/mtmd/clip-impl.h +5 -0
- package/cpp/tools/mtmd/clip.cpp +572 -436
- package/cpp/tools/mtmd/clip.h +14 -4
- package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
- package/cpp/tools/mtmd/mtmd-audio.h +2 -17
- package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
- package/cpp/tools/mtmd/mtmd-helper.h +91 -0
- package/cpp/tools/mtmd/mtmd.cpp +368 -248
- package/cpp/tools/mtmd/mtmd.h +6 -70
- package/cpp/unicode.cpp +5 -0
- package/ios/CMakeLists.txt +26 -6
- package/ios/RNLlama.h +1 -1
- package/ios/RNLlama.mm +153 -3
- package/ios/RNLlamaContext.h +9 -1
- package/ios/RNLlamaContext.mm +112 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +24 -0
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +46 -2
- package/src/index.ts +105 -1
- package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
- package/cpp/ggml-cpu/sgemm.cpp +0 -3544
- package/cpp/ggml-cpu/sgemm.h +0 -14
- package/cpp/llama-kv-cache.cpp +0 -2827
- package/cpp/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
- /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
- /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/chat.h
CHANGED
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "common.h"
+#include <functional>
 #include <chrono>
 #include <string>
 #include <vector>
@@ -21,11 +22,19 @@ struct common_chat_tool_call {
     std::string name;
     std::string arguments;
     std::string id;
+
+    bool operator==(const common_chat_tool_call & other) const {
+        return name == other.name && arguments == other.arguments && id == other.id;
+    }
 };
 
 struct common_chat_msg_content_part {
     std::string type;
     std::string text;
+
+    bool operator==(const common_chat_msg_content_part & other) const {
+        return type == other.type && text == other.text;
+    }
 };
 
 struct common_chat_msg {
@@ -36,6 +45,51 @@ struct common_chat_msg {
     std::string reasoning_content;
     std::string tool_name;
     std::string tool_call_id;
+
+    template <class T> T to_json_oaicompat() const;
+
+    bool empty() const {
+        return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty();
+    }
+    void ensure_tool_call_ids_set(std::vector<std::string> & ids_cache, const std::function<std::string()> & gen_tool_call_id) {
+        for (auto i = 0u; i < tool_calls.size(); i++) {
+            if (ids_cache.size() <= i) {
+                auto id = tool_calls[i].id;
+                if (id.empty()) {
+                    id = gen_tool_call_id();
+                }
+                ids_cache.push_back(id);
+            }
+            tool_calls[i].id = ids_cache[i];
+        }
+    }
+    bool operator==(const common_chat_msg & other) const {
+        return role == other.role
+            && content == other.content
+            && content_parts == other.content_parts
+            && tool_calls == other.tool_calls
+            && reasoning_content == other.reasoning_content
+            && tool_name == other.tool_name
+            && tool_call_id == other.tool_call_id;
+    }
+    bool operator!=(const common_chat_msg & other) const {
+        return !(*this == other);
+    }
+};
+
+struct common_chat_msg_diff {
+    std::string reasoning_content_delta;
+    std::string content_delta;
+    size_t tool_call_index = std::string::npos;
+    common_chat_tool_call tool_call_delta;
+
+    static std::vector<common_chat_msg_diff> compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg);
+
+    bool operator==(const common_chat_msg_diff & other) const {
+        return content_delta == other.content_delta
+            && tool_call_index == other.tool_call_index
+            && tool_call_delta == other.tool_call_delta;
+    }
 };
 
 struct common_chat_tool {
@@ -57,14 +111,11 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_LLAMA_3_X,
     COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
     COMMON_CHAT_FORMAT_DEEPSEEK_R1,
-    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
-    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
-    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -79,7 +130,8 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -88,11 +140,21 @@ struct common_chat_params {
     std::string prompt;
     std::string grammar;
     bool grammar_lazy = false;
+    bool thinking_forced_open = false;
     std::vector<common_grammar_trigger> grammar_triggers;
     std::vector<std::string> preserved_tokens;
     std::vector<std::string> additional_stops;
 };
 
+struct common_chat_syntax {
+    common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
+    bool reasoning_in_content = false;
+    bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
 
@@ -129,8 +191,9 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);
 
-
-
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
@@ -143,3 +206,5 @@ template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common
 // T can be std::string containing JSON or nlohmann::ordered_json
 template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
 template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);
+
+template <class T> T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);
package/cpp/common.cpp
CHANGED
@@ -210,6 +210,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     DWORD p = NORMAL_PRIORITY_CLASS;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break;
         case LM_GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
         case LM_GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
         case LM_GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
@@ -235,6 +236,7 @@ bool set_process_priority(enum lm_ggml_sched_priority prio) {
 
     int p = 0;
     switch (prio) {
+        case LM_GGML_SCHED_PRIO_LOW: p = 5; break;
         case LM_GGML_SCHED_PRIO_NORMAL: p = 0; break;
         case LM_GGML_SCHED_PRIO_MEDIUM: p = -5; break;
         case LM_GGML_SCHED_PRIO_HIGH: p = -10; break;
@@ -471,7 +473,7 @@ size_t string_find_partial_stop(const std::string_view & str, const std::string_
 
 std::string regex_escape(const std::string & s) {
     static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
-    return std::regex_replace(s, special_chars, "
+    return std::regex_replace(s, special_chars, "\\$&");
 }
 
 std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
@@ -711,11 +713,17 @@ bool fs_validate_filename(const std::string & filename) {
     // disable C++17 deprecation warning for std::codecvt_utf8
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
+
     std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
     filename_utf32 = converter.from_bytes(filename);
@@ -772,6 +780,9 @@ bool fs_validate_filename(const std::string & filename) {
     return true;
 }
 
+#include <iostream>
+
+
 // returns true if successful, false otherwise
 bool fs_create_directory_with_parents(const std::string & path) {
 #ifdef _WIN32
@@ -789,9 +800,16 @@ bool fs_create_directory_with_parents(const std::string & path) {
     // process path from front to back, procedurally creating directories
     while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
         const std::wstring subpath = wpath.substr(0, pos_slash);
-        const wchar_t * test = subpath.c_str();
 
-
+        pos_slash += 1;
+
+        // skip the drive letter, in some systems it can return an access denied error
+        if (subpath.length() == 2 && subpath[1] == ':') {
+            continue;
+        }
+
+        const bool success = CreateDirectoryW(subpath.c_str(), NULL);
+
         if (!success) {
             const DWORD error = GetLastError();
 
@@ -805,8 +823,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
                 return false;
             }
         }
-
-        pos_slash += 1;
     }
 
     return true;
@@ -856,7 +872,7 @@ std::string fs_get_cache_directory() {
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
     } else {
-#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX) || defined(__OpenBSD__)
         if (std::getenv("XDG_CACHE_HOME")) {
             cache_directory = std::getenv("XDG_CACHE_HOME");
         } else {
@@ -902,31 +918,6 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     const llama_vocab * vocab = llama_model_get_vocab(model);
 
-    if (params.reranking) {
-        bool ok = true;
-
-        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
-        }
-
-        if (!ok) {
-            llama_model_free(model);
-
-            return iparams;
-        }
-    }
-
     auto cparams = common_context_params_to_llama(params);
 
     llama_context * lctx = llama_init_from_model(model, cparams);
@@ -936,7 +927,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
-    if (params.ctx_shift && !
+    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
         LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
         params.ctx_shift = false;
     }
@@ -968,6 +959,35 @@ struct common_init_result common_init_from_params(common_params & params) {
         }
     }
 
+    if (llama_pooling_type(lctx) == LLAMA_POOLING_TYPE_RANK) {
+        bool ok = true;
+
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
+        bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+
+        if (!has_eos && !has_sep) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+            ok = false;
+        } else if (!has_eos) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
+        } else if (!has_sep) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
+            ok = false;
+        }
+
+        if (!ok) {
+            llama_free(lctx);
+            llama_model_free(model);
+
+            return iparams;
+        }
+    }
+
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
         llama_adapter_lora_ptr lora;
@@ -1043,7 +1063,7 @@ struct common_init_result common_init_from_params(common_params & params) {
         if (llama_model_has_decoder(model)) {
             llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
         }
-
+        llama_memory_clear(llama_get_memory(lctx), true);
         llama_synchronize(lctx);
         llama_perf_context_reset(lctx);
         llama_set_warmup(lctx, false);
@@ -1145,11 +1165,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full = params.swa_full;
 
-    if (params.reranking) {
-        cparams.embeddings = true;
-        cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
-    }
-
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;
 
@@ -1282,6 +1297,9 @@ std::vector<llama_token> common_tokenize(
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
     n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    if (n_tokens == std::numeric_limits<int32_t>::min()) {
+        throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
+    }
    if (n_tokens < 0) {
        result.resize(-n_tokens);
        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
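Among the common.cpp fixes, the regex_escape change is easy to miss: in ECMAScript replacement syntax, "$&" stands for the entire match, so each regex metacharacter is re-emitted with a backslash in front of it. A small self-contained check of that behavior (illustrative, not package code):

    #include <cassert>
    #include <regex>
    #include <string>

    int main() {
        // same character class as common.cpp's regex_escape
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        // "\\$&" re-inserts the whole match preceded by a backslash
        const std::string escaped = std::regex_replace(std::string("a.b*c"), special_chars, "\\$&");
        assert(escaped == "a\\.b\\*c");
        return 0;
    }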
package/cpp/common.h
CHANGED
@@ -126,7 +126,7 @@ enum common_grammar_trigger_type {
     COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN,
     COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
     COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
-
+    COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
 };
 
 struct common_grammar_trigger {
@@ -210,6 +210,9 @@ struct common_params_speculative {
     float p_split = 0.1f; // speculative decoding split probability
     float p_min = 0.75f; // minimum speculative decoding probability (greedy)
 
+    lm_ggml_type cache_type_k = LM_GGML_TYPE_F16; // KV cache data type for the K
+    lm_ggml_type cache_type_v = LM_GGML_TYPE_F16; // KV cache data type for the V
+
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
@@ -226,7 +229,8 @@ struct common_params_vocoder {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
-
+    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
+    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
 
 struct common_params {
@@ -306,6 +310,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
+    bool offline = false;
 
     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -368,7 +373,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
-
+    std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port
@@ -383,6 +388,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
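common_params_speculative now carries its own KV-cache types, so a draft model can use a quantized cache independently of the target model. A hedged sketch of wiring this up (it assumes common_params embeds the struct as params.speculative, as in upstream llama.cpp, and that a q8_0 cache type is accepted here; both are assumptions, not shown in this diff):

    common_params params;
    // target model keeps the default f16 cache
    params.cache_type_k = LM_GGML_TYPE_F16;
    params.cache_type_v = LM_GGML_TYPE_F16;
    // draft model trades a little accuracy for memory (assumes q8_0 is valid here)
    params.speculative.cache_type_k = LM_GGML_TYPE_Q8_0;
    params.speculative.cache_type_v = LM_GGML_TYPE_Q8_0;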
package/cpp/ggml-backend-reg.cpp
CHANGED
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
package/cpp/ggml-backend.cpp
CHANGED
@@ -1340,7 +1340,10 @@ static bool lm_ggml_backend_sched_alloc_splits(lm_ggml_backend_sched_t sched) {
     // allocate graph
     if (backend_ids_changed || !lm_ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
         // the re-allocation may cause the split inputs to be moved to a different address
-        lm_ggml_backend_sched_synchronize
+        // synchronize without lm_ggml_backend_sched_synchronize to avoid changing cur_copy
+        for (int i = 0; i < sched->n_backends; i++) {
+            lm_ggml_backend_synchronize(sched->backends[i]);
+        }
 #ifndef NDEBUG
         LM_GGML_LOG_DEBUG("%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
 #endif
@@ -1564,7 +1567,6 @@ bool lm_ggml_backend_sched_alloc_graph(lm_ggml_backend_sched_t sched, struct lm_
 
     lm_ggml_backend_sched_split_graph(sched, graph);
 
-
     if (!lm_ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
@@ -1598,6 +1600,12 @@ void lm_ggml_backend_sched_synchronize(lm_ggml_backend_sched_t sched) {
     for (int i = 0; i < sched->n_backends; i++) {
         lm_ggml_backend_synchronize(sched->backends[i]);
     }
+    if (!sched->is_alloc) {
+        // if the graph is not already allocated, always use copy 0 after a synchronization
+        // this ensures that during generation the same copy is used every time,
+        // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
+        sched->cur_copy = 0;
+    }
 }
 
 void lm_ggml_backend_sched_set_eval_callback(lm_ggml_backend_sched_t sched, lm_ggml_backend_sched_eval_callback callback, void * user_data) {
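Both scheduler changes revolve around cur_copy: the scheduler keeps several copies of its input tensors so one can be filled while another is consumed, and per the diff's own comment, a graph that flips between copies can cause captured CUDA (or other) graphs to be disabled. A toy model of the invariant being enforced (illustrative only, not the actual sched struct):

    struct toy_sched {
        int n_copies = 2;      // pipelined input buffers
        int cur_copy = 0;
        bool is_alloc = false;

        void synchronize() {
            // ...wait for every backend to finish...
            if (!is_alloc) {
                // pin to copy 0 so the next allocated graph is laid out
                // identically on every decode call
                cur_copy = 0;
            }
        }
    };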
package/cpp/ggml-common.h
CHANGED
@@ -1074,6 +1074,10 @@ LM_GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 LM_GGML_TABLE_END()
 
+LM_GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+LM_GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
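kvalues_iq4nl is the 16-entry non-linear codebook used by the IQ4_NL quantization type: each 4-bit index selects a codebook value, which is then scaled by the block's scale. A hedged sketch of the lookup (the real kernels live in the quants sources; this only illustrates the table's role):

    #include <cstdint>

    static const int8_t kvalues_iq4nl[16] = {
        -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
    };

    // Dequantize one 4-bit index from an IQ4_NL block with scale d.
    inline float dequant_iq4nl(uint8_t idx, float d) {
        return d * (float) kvalues_iq4nl[idx & 0x0F];
    }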
package/cpp/ggml-cpu/amx/amx.cpp
CHANGED
package/cpp/ggml-cpu/amx/mmq.cpp
CHANGED
@@ -8,7 +8,8 @@
 #include "mmq.h"
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
-#include "
+#include "simd-mappings.h"
+#include "quants.h"
 #include "ggml-quants.h"
 #include <algorithm>
 #include <type_traits>
@@ -453,7 +454,7 @@ void quantize_row_q8_K_vnni(const float * RESTRICT x, void * RESTRICT vy, int64_
 
     // Quantize these floats
     const float iscale = 127.f / amax;
-    y[i].d =
+    y[i].d = LM_GGML_CPU_FP32_TO_FP16(1 / iscale);
     const float id = ( amax != 0.0f ) ? iscale : 0.f;
     const __m512 vscale = _mm512_set1_ps(id);
 
@@ -1090,7 +1091,7 @@ struct acc_C<block_q8_0, block_q4_0, is_acc> {
     const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
     for (int m = 0; m < nr; ++m) {
-        const __m512 vd1 = _mm512_set1_ps(
+        const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
         const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
         __m512 vsum;
@@ -1113,8 +1114,8 @@ struct acc_C<block_q8_1, block_q4_1, is_acc> {
     const __m512 vm0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset + TILE_N * sizeof(lm_ggml_half))));
 
     for (int m = 0; m < nr; ++m) {
-        const __m512 vd1 = _mm512_set1_ps(
-        const __m512 vs1 = _mm512_set1_ps(
+        const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
+        const __m512 vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].s));
         const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
         __m512 vsum;
@@ -1137,7 +1138,7 @@ struct acc_C<block_q8_0, block_q8_0, is_acc> {
     const __m512 vd0 = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)((const char *)packed_B + offset)));
 
     for (int m = 0; m < nr; ++m) {
-        const __m512 vd1 = _mm512_set1_ps(
+        const __m512 vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[m * lda].d));
         const __m512 vtile = _mm512_cvtepi32_ps(_mm512_loadu_si512(tile + m * TILE_N));
 
         __m512 vsum;
@@ -1437,7 +1438,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
         va[k] = _mm512_set1_epi32(a_ptr[k]);
         vcomp = _mm512_dpbusd_epi32(vcomp, off, va[k]);
     }
-    vd1 = _mm512_set1_ps(
+    vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
 }
 
 // load b
@@ -1498,8 +1499,8 @@ struct tinygemm_kernel_vnni<block_q8_1, block_q4_1, float, 1, BLOCK_N, BLOCK_K>
     for (int k = 0; k < 8; ++k) {
         va[k] = _mm512_set1_epi32(a_ptr[k]);
     }
-    vd1 = _mm512_set1_ps(
-    vs1 = _mm512_set1_ps(
+    vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
+    vs1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].s));
 }
 
 // load b
@@ -1571,7 +1572,7 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q8_0, float, BLOCK_M, BLOCK_N, BLO
         va[k] = _mm512_set1_epi32(a_ptr[k]);
         va[k] = _mm512_add_epi8(va[k], off);
     }
-    vd1 = _mm512_set1_ps(
+    vd1 = _mm512_set1_ps(LM_GGML_CPU_FP16_TO_FP32(A[0 * KB + i].d));
 }
 
 // load b
package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp
ADDED
@@ -0,0 +1,94 @@
+#include "ggml-backend-impl.h"
+
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+#if !defined(HWCAP2_I8MM)
+#define HWCAP2_I8MM (1 << 13)
+#endif
+
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
+struct aarch64_features {
+    // has_neon not needed, aarch64 has NEON guaranteed
+    bool has_dotprod = false;
+    bool has_fp16_va = false;
+    bool has_sve = false;
+    bool has_sve2 = false;
+    bool has_i8mm = false;
+    bool has_sme = false;
+
+    aarch64_features() {
+#if defined(__linux__)
+        uint32_t hwcap = getauxval(AT_HWCAP);
+        uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+        has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
+        has_fp16_va = !!(hwcap & HWCAP_FPHP);
+        has_sve = !!(hwcap & HWCAP_SVE);
+        has_sve2 = !!(hwcap2 & HWCAP2_SVE2);
+        has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+        has_sme = !!(hwcap2 & HWCAP2_SME);
+#elif defined(__APPLE__)
+        int oldp = 0;
+        size_t size = sizeof(oldp);
+
+        if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) == 0) {
+            has_dotprod = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) == 0) {
+            has_i8mm = static_cast<bool>(oldp);
+        }
+
+        if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) == 0) {
+            has_sme = static_cast<bool>(oldp);
+        }
+
+        // Apple apparently does not implement SVE yet
+#endif
+    }
+};
+
+static int lm_ggml_backend_cpu_aarch64_score() {
+    int score = 1;
+    aarch64_features af;
+
+#ifdef LM_GGML_USE_DOTPROD
+    if (!af.has_dotprod) { return 0; }
+    score += 1<<1;
+#endif
+#ifdef LM_GGML_USE_FP16_VECTOR_ARITHMETIC
+    if (!af.has_fp16_va) { return 0; }
+    score += 1<<2;
+#endif
+#ifdef LM_GGML_USE_SVE
+    if (!af.has_sve) { return 0; }
+    score += 1<<3;
+#endif
+#ifdef LM_GGML_USE_MATMUL_INT8
+    if (!af.has_i8mm) { return 0; }
+    score += 1<<4;
+#endif
+#ifdef LM_GGML_USE_SVE2
+    if (!af.has_sve2) { return 0; }
+    score += 1<<5;
+#endif
+#ifdef LM_GGML_USE_SME
+    if (!af.has_sme) { return 0; }
+    score += 1<<6;
+#endif
+
+    return score;
+}
+
+LM_GGML_BACKEND_DL_SCORE_IMPL(lm_ggml_backend_cpu_aarch64_score)
+
+# endif // defined(__aarch64__)