cui-llama.rn 1.4.6 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +20 -20
- package/README.md +317 -319
- package/android/build.gradle +116 -116
- package/android/gradle.properties +5 -5
- package/android/src/main/AndroidManifest.xml +4 -4
- package/android/src/main/CMakeLists.txt +124 -117
- package/android/src/main/java/com/rnllama/LlamaContext.java +645 -645
- package/android/src/main/java/com/rnllama/RNLlama.java +695 -695
- package/android/src/main/java/com/rnllama/RNLlamaPackage.java +48 -48
- package/android/src/main/jni-utils.h +100 -100
- package/android/src/main/jni.cpp +1263 -1245
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +135 -135
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +136 -136
- package/cpp/README.md +4 -4
- package/cpp/binary-ops.cpp +158 -0
- package/cpp/binary-ops.h +16 -0
- package/cpp/chat.cpp +1769 -1779
- package/cpp/chat.h +9 -1
- package/cpp/common.cpp +20 -522
- package/cpp/common.h +13 -36
- package/cpp/cpu-common.h +72 -0
- package/cpp/ggml-common.h +12 -6
- package/cpp/ggml-cpu-aarch64.cpp +1557 -80
- package/cpp/ggml-cpu-impl.h +2 -21
- package/cpp/ggml-cpu-quants.c +904 -405
- package/cpp/ggml-cpu.c +909 -13237
- package/cpp/ggml-impl.h +50 -23
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal-impl.h +597 -523
- package/cpp/ggml-metal.m +798 -580
- package/cpp/ggml.c +92 -3
- package/cpp/ggml.h +30 -6
- package/cpp/gguf.cpp +1 -0
- package/cpp/llama-adapter.cpp +55 -20
- package/cpp/llama-adapter.h +11 -9
- package/cpp/llama-arch.cpp +217 -16
- package/cpp/llama-arch.h +25 -0
- package/cpp/llama-batch.h +2 -2
- package/cpp/llama-chat.cpp +54 -2
- package/cpp/llama-chat.h +3 -0
- package/cpp/llama-context.cpp +2294 -1238
- package/cpp/llama-context.h +214 -77
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +1695 -0
- package/cpp/llama-graph.h +592 -0
- package/cpp/llama-hparams.cpp +8 -0
- package/cpp/llama-hparams.h +17 -0
- package/cpp/llama-io.cpp +15 -0
- package/cpp/llama-io.h +35 -0
- package/cpp/llama-kv-cache.cpp +965 -303
- package/cpp/llama-kv-cache.h +145 -151
- package/cpp/llama-memory.cpp +1 -0
- package/cpp/llama-memory.h +21 -0
- package/cpp/llama-mmap.cpp +1 -1
- package/cpp/llama-model-loader.cpp +10 -5
- package/cpp/llama-model-loader.h +5 -3
- package/cpp/llama-model.cpp +9194 -201
- package/cpp/llama-model.h +40 -1
- package/cpp/llama-sampling.cpp +5 -0
- package/cpp/llama-vocab.cpp +36 -5
- package/cpp/llama.cpp +51 -9984
- package/cpp/llama.h +102 -22
- package/cpp/log.cpp +34 -0
- package/cpp/minja/chat-template.hpp +15 -7
- package/cpp/minja/minja.hpp +120 -94
- package/cpp/ops.cpp +8723 -0
- package/cpp/ops.h +128 -0
- package/cpp/rn-llama.cpp +873 -882
- package/cpp/rn-llama.h +138 -148
- package/cpp/sampling.cpp +3 -0
- package/cpp/sampling.h +107 -107
- package/cpp/sgemm.cpp +533 -88
- package/cpp/simd-mappings.h +888 -0
- package/cpp/speculative.cpp +4 -4
- package/cpp/unary-ops.cpp +186 -0
- package/cpp/unary-ops.h +28 -0
- package/cpp/unicode-data.cpp +7034 -7034
- package/cpp/unicode-data.h +20 -20
- package/cpp/unicode.cpp +849 -849
- package/cpp/unicode.h +66 -66
- package/cpp/vec.cpp +258 -0
- package/cpp/vec.h +802 -0
- package/ios/CMakeLists.txt +116 -105
- package/ios/RNLlama.h +7 -7
- package/ios/RNLlama.mm +418 -405
- package/ios/RNLlamaContext.h +57 -57
- package/ios/RNLlamaContext.mm +835 -819
- package/ios/rnllama.xcframework/Info.plist +74 -74
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +143 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +677 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +2222 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/gguf.h +202 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json.hpp +24766 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +265 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-io.h +35 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +409 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +1434 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/log.h +132 -0
- package/{cpp → ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja}/chat-template.hpp +15 -7
- package/{cpp → ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja}/minja.hpp +120 -94
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sampling.h +107 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +14 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/speculative.h +28 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/unicode.h +66 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +802 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +16 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +143 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +677 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/cpu-common.h +72 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +1857 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +594 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +216 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +2222 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/gguf.h +202 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json.hpp +24766 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-adapter.h +76 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +428 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +88 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +265 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cpp.h +30 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-grammar.h +173 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +592 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +156 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-impl.h +61 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-io.h +35 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +213 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +21 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-mmap.h +68 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-loader.h +169 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +409 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-sampling.h +32 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +125 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +1434 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/log.h +132 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +2941 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sampling.h +107 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +14 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +888 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/speculative.h +28 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +28 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode-data.h +20 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode.h +66 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +802 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +16 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +143 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +677 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/cpu-common.h +72 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-alloc.h +76 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend-impl.h +255 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +354 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +1857 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +39 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +8 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +512 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +63 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +594 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +597 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal.h +66 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +216 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-quants.h +100 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-threading.h +14 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +2222 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/gguf.h +202 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +21 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +24766 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-adapter.h +76 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +428 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +88 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +56 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +265 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +38 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cpp.h +30 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-grammar.h +173 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +592 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +156 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-impl.h +61 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-io.h +35 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +213 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +21 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-mmap.h +68 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-loader.h +169 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +409 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-sampling.h +32 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +125 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +1434 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/log.h +132 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +537 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +2941 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +128 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +138 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sampling.h +107 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +14 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +888 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/speculative.h +28 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +28 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode-data.h +20 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unicode.h +66 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +802 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +101 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +203 -203
- package/lib/commonjs/NativeRNLlama.js +1 -2
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/chat.js.map +1 -1
- package/lib/commonjs/grammar.js +12 -31
- package/lib/commonjs/grammar.js.map +1 -1
- package/lib/commonjs/index.js +47 -47
- package/lib/commonjs/index.js.map +1 -1
- package/lib/commonjs/package.json +1 -0
- package/lib/module/NativeRNLlama.js +2 -0
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/chat.js +2 -0
- package/lib/module/chat.js.map +1 -1
- package/lib/module/grammar.js +14 -31
- package/lib/module/grammar.js.map +1 -1
- package/lib/module/index.js +47 -45
- package/lib/module/index.js.map +1 -1
- package/lib/module/package.json +1 -0
- package/lib/typescript/NativeRNLlama.d.ts +6 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/llama-rn.podspec +48 -48
- package/package.json +233 -233
- package/src/NativeRNLlama.ts +426 -424
- package/src/chat.ts +44 -44
- package/src/grammar.ts +854 -854
- package/src/index.ts +495 -485
package/cpp/rn-llama.cpp
CHANGED
@@ -1,882 +1,873 @@
|
|
1
|
-
#include "rn-llama.h"
|
2
|
-
|
3
|
-
namespace rnllama {
|
4
|
-
|
5
|
-
const std::vector<lm_ggml_type> kv_cache_types = {
|
6
|
-
LM_GGML_TYPE_F32,
|
7
|
-
LM_GGML_TYPE_F16,
|
8
|
-
LM_GGML_TYPE_BF16,
|
9
|
-
LM_GGML_TYPE_Q8_0,
|
10
|
-
LM_GGML_TYPE_Q4_0,
|
11
|
-
LM_GGML_TYPE_Q4_1,
|
12
|
-
LM_GGML_TYPE_IQ4_NL,
|
13
|
-
LM_GGML_TYPE_Q5_0,
|
14
|
-
LM_GGML_TYPE_Q5_1,
|
15
|
-
};
|
16
|
-
|
17
|
-
lm_ggml_type kv_cache_type_from_str(const std::string & s) {
|
18
|
-
for (const auto & type : kv_cache_types) {
|
19
|
-
if (lm_ggml_type_name(type) == s) {
|
20
|
-
return type;
|
21
|
-
}
|
22
|
-
}
|
23
|
-
throw std::runtime_error("Unsupported cache type: " + s);
|
24
|
-
}
|
25
|
-
|
26
|
-
static void llama_batch_clear(llama_batch *batch) {
|
27
|
-
batch->n_tokens = 0;
|
28
|
-
}
|
29
|
-
|
30
|
-
static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
|
31
|
-
batch->token [batch->n_tokens] = id;
|
32
|
-
batch->pos [batch->n_tokens] = pos;
|
33
|
-
batch->n_seq_id[batch->n_tokens] = seq_ids.size();
|
34
|
-
for (size_t i = 0; i < seq_ids.size(); i++) {
|
35
|
-
batch->seq_id[batch->n_tokens][i] = seq_ids[i];
|
36
|
-
}
|
37
|
-
batch->logits [batch->n_tokens] = logits ? 1 : 0;
|
38
|
-
batch->n_tokens += 1;
|
39
|
-
}
|
40
|
-
|
41
|
-
// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
|
42
|
-
|
43
|
-
static void log(const char *level, const char *function, int line,
|
44
|
-
const char *format, ...)
|
45
|
-
{
|
46
|
-
va_list args;
|
47
|
-
#if defined(__ANDROID__)
|
48
|
-
char prefix[256];
|
49
|
-
snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
|
50
|
-
|
51
|
-
va_start(args, format);
|
52
|
-
android_LogPriority priority;
|
53
|
-
if (strcmp(level, "ERROR") == 0) {
|
54
|
-
priority = ANDROID_LOG_ERROR;
|
55
|
-
} else if (strcmp(level, "WARNING") == 0) {
|
56
|
-
priority = ANDROID_LOG_WARN;
|
57
|
-
} else if (strcmp(level, "INFO") == 0) {
|
58
|
-
priority = ANDROID_LOG_INFO;
|
59
|
-
} else {
|
60
|
-
priority = ANDROID_LOG_DEBUG;
|
61
|
-
}
|
62
|
-
__android_log_vprint(priority, "RNLlama", prefix, args);
|
63
|
-
va_end(args);
|
64
|
-
#else
|
65
|
-
printf("[%s] %s:%d ", level, function, line);
|
66
|
-
va_start(args, format);
|
67
|
-
vprintf(format, args);
|
68
|
-
va_end(args);
|
69
|
-
printf("\n");
|
70
|
-
#endif
|
71
|
-
}
|
72
|
-
|
73
|
-
#if RNLLAMA_VERBOSE != 1
|
74
|
-
#define LOG_VERBOSE(MSG, ...)
|
75
|
-
#else
|
76
|
-
#define LOG_VERBOSE(MSG, ...) \
|
77
|
-
do \
|
78
|
-
{ \
|
79
|
-
if (rnllama_verbose) \
|
80
|
-
{ \
|
81
|
-
log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
|
82
|
-
} \
|
83
|
-
} while (0)
|
84
|
-
#endif
|
85
|
-
|
86
|
-
#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
87
|
-
#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
88
|
-
#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
89
|
-
|
90
|
-
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
91
|
-
{
|
92
|
-
size_t i;
|
93
|
-
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
94
|
-
{
|
95
|
-
}
|
96
|
-
return i;
|
97
|
-
}
|
98
|
-
|
99
|
-
static bool ends_with(const std::string &str, const std::string &suffix)
|
100
|
-
{
|
101
|
-
return str.size() >= suffix.size() &&
|
102
|
-
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
103
|
-
}
|
104
|
-
|
105
|
-
static size_t find_partial_stop_string(const std::string &stop,
|
106
|
-
const std::string &text)
|
107
|
-
{
|
108
|
-
if (!text.empty() && !stop.empty())
|
109
|
-
{
|
110
|
-
const char text_last_char = text.back();
|
111
|
-
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
112
|
-
{
|
113
|
-
if (stop[char_index] == text_last_char)
|
114
|
-
{
|
115
|
-
const std::string current_partial = stop.substr(0, char_index + 1);
|
116
|
-
if (ends_with(text, current_partial))
|
117
|
-
{
|
118
|
-
return text.size() - char_index - 1;
|
119
|
-
}
|
120
|
-
}
|
121
|
-
}
|
122
|
-
}
|
123
|
-
return std::string::npos;
|
124
|
-
}
|
125
|
-
|
126
|
-
// format incomplete utf-8 multibyte character for output
|
127
|
-
std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
128
|
-
{
|
129
|
-
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
|
130
|
-
// if the size is 1 and first bit is 1, meaning it's a partial character
|
131
|
-
// (size > 1 meaning it's already a known token)
|
132
|
-
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
133
|
-
{
|
134
|
-
std::stringstream ss;
|
135
|
-
ss << std::hex << (out[0] & 0xff);
|
136
|
-
std::string res(ss.str());
|
137
|
-
out = "byte: \\x" + res;
|
138
|
-
}
|
139
|
-
return out;
|
140
|
-
}
|
141
|
-
|
142
|
-
std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end)
|
143
|
-
{
|
144
|
-
std::string ret;
|
145
|
-
for (auto it = begin; it != end; ++it)
|
146
|
-
{
|
147
|
-
ret += common_token_to_piece(ctx, *it);
|
148
|
-
}
|
149
|
-
return ret;
|
150
|
-
}
|
151
|
-
|
152
|
-
llama_rn_context::~llama_rn_context() {
|
153
|
-
if (ctx_sampling != nullptr) {
|
154
|
-
common_sampler_free(ctx_sampling);
|
155
|
-
}
|
156
|
-
}
|
157
|
-
|
158
|
-
void llama_rn_context::rewind() {
|
159
|
-
is_interrupted = false;
|
160
|
-
params.antiprompt.clear();
|
161
|
-
params.sampling.grammar.clear();
|
162
|
-
num_prompt_tokens = 0;
|
163
|
-
num_tokens_predicted = 0;
|
164
|
-
generated_text = "";
|
165
|
-
generated_text.reserve(params.n_ctx);
|
166
|
-
generated_token_probs.clear();
|
167
|
-
truncated = false;
|
168
|
-
stopped_eos = false;
|
169
|
-
stopped_word = false;
|
170
|
-
stopped_limit = false;
|
171
|
-
stopping_word = "";
|
172
|
-
incomplete = false;
|
173
|
-
n_remain = 0;
|
174
|
-
n_past = 0;
|
175
|
-
params.sampling.n_prev = n_ctx;
|
176
|
-
}
|
177
|
-
|
178
|
-
bool llama_rn_context::initSampling() {
|
179
|
-
if (ctx_sampling != nullptr) {
|
180
|
-
common_sampler_free(ctx_sampling);
|
181
|
-
}
|
182
|
-
ctx_sampling = common_sampler_init(model, params.sampling);
|
183
|
-
return ctx_sampling != nullptr;
|
184
|
-
}
|
185
|
-
|
186
|
-
bool llama_rn_context::loadModel(common_params ¶ms_)
|
187
|
-
{
|
188
|
-
params = params_;
|
189
|
-
llama_init = common_init_from_params(params);
|
190
|
-
model = llama_init.model.get();
|
191
|
-
ctx = llama_init.context.get();
|
192
|
-
if (model == nullptr)
|
193
|
-
{
|
194
|
-
LOG_ERROR("unable to load model: %s", params_.model.c_str());
|
195
|
-
return false;
|
196
|
-
}
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
//
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
const std::string &
|
217
|
-
const std::string &
|
218
|
-
const std::string &
|
219
|
-
const
|
220
|
-
const
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
std::string
|
249
|
-
const std::string &
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
prompt_tokens =
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
//
|
314
|
-
if
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
{
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
n_past
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
}
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
result.tok =
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
size_t
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
}
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
{
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
{
|
626
|
-
|
627
|
-
}
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
|
701
|
-
}
|
702
|
-
|
703
|
-
|
704
|
-
|
705
|
-
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
return
|
714
|
-
}
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
|
726
|
-
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
|
756
|
-
|
757
|
-
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
}
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);
|
875
|
-
|
876
|
-
current_context_tokens.resize(current_context_tokens.size() - diff);
|
877
|
-
}
|
878
|
-
}
|
879
|
-
|
880
|
-
}
|
881
|
-
|
882
|
-
}
|
1
|
+
#include "rn-llama.h"
|
2
|
+
|
3
|
+
namespace rnllama {
|
4
|
+
|
5
|
+
const std::vector<lm_ggml_type> kv_cache_types = {
|
6
|
+
LM_GGML_TYPE_F32,
|
7
|
+
LM_GGML_TYPE_F16,
|
8
|
+
LM_GGML_TYPE_BF16,
|
9
|
+
LM_GGML_TYPE_Q8_0,
|
10
|
+
LM_GGML_TYPE_Q4_0,
|
11
|
+
LM_GGML_TYPE_Q4_1,
|
12
|
+
LM_GGML_TYPE_IQ4_NL,
|
13
|
+
LM_GGML_TYPE_Q5_0,
|
14
|
+
LM_GGML_TYPE_Q5_1,
|
15
|
+
};
|
16
|
+
|
17
|
+
lm_ggml_type kv_cache_type_from_str(const std::string & s) {
|
18
|
+
for (const auto & type : kv_cache_types) {
|
19
|
+
if (lm_ggml_type_name(type) == s) {
|
20
|
+
return type;
|
21
|
+
}
|
22
|
+
}
|
23
|
+
throw std::runtime_error("Unsupported cache type: " + s);
|
24
|
+
}
|
25
|
+
|
26
|
+
static void llama_batch_clear(llama_batch *batch) {
|
27
|
+
batch->n_tokens = 0;
|
28
|
+
}
|
29
|
+
|
30
|
+
static void llama_batch_add(llama_batch *batch, llama_token id, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
|
31
|
+
batch->token [batch->n_tokens] = id;
|
32
|
+
batch->pos [batch->n_tokens] = pos;
|
33
|
+
batch->n_seq_id[batch->n_tokens] = seq_ids.size();
|
34
|
+
for (size_t i = 0; i < seq_ids.size(); i++) {
|
35
|
+
batch->seq_id[batch->n_tokens][i] = seq_ids[i];
|
36
|
+
}
|
37
|
+
batch->logits [batch->n_tokens] = logits ? 1 : 0;
|
38
|
+
batch->n_tokens += 1;
|
39
|
+
}
|
40
|
+
|
41
|
+
// NOTE: Edit from https://github.com/ggerganov/llama.cpp/blob/master/examples/server/server.cpp
|
42
|
+
|
43
|
+
static void log(const char *level, const char *function, int line,
|
44
|
+
const char *format, ...)
|
45
|
+
{
|
46
|
+
va_list args;
|
47
|
+
#if defined(__ANDROID__)
|
48
|
+
char prefix[256];
|
49
|
+
snprintf(prefix, sizeof(prefix), "%s:%d %s", function, line, format);
|
50
|
+
|
51
|
+
va_start(args, format);
|
52
|
+
android_LogPriority priority;
|
53
|
+
if (strcmp(level, "ERROR") == 0) {
|
54
|
+
priority = ANDROID_LOG_ERROR;
|
55
|
+
} else if (strcmp(level, "WARNING") == 0) {
|
56
|
+
priority = ANDROID_LOG_WARN;
|
57
|
+
} else if (strcmp(level, "INFO") == 0) {
|
58
|
+
priority = ANDROID_LOG_INFO;
|
59
|
+
} else {
|
60
|
+
priority = ANDROID_LOG_DEBUG;
|
61
|
+
}
|
62
|
+
__android_log_vprint(priority, "RNLlama", prefix, args);
|
63
|
+
va_end(args);
|
64
|
+
#else
|
65
|
+
printf("[%s] %s:%d ", level, function, line);
|
66
|
+
va_start(args, format);
|
67
|
+
vprintf(format, args);
|
68
|
+
va_end(args);
|
69
|
+
printf("\n");
|
70
|
+
#endif
|
71
|
+
}
|
72
|
+
|
73
|
+
#if RNLLAMA_VERBOSE != 1
|
74
|
+
#define LOG_VERBOSE(MSG, ...)
|
75
|
+
#else
|
76
|
+
#define LOG_VERBOSE(MSG, ...) \
|
77
|
+
do \
|
78
|
+
{ \
|
79
|
+
if (rnllama_verbose) \
|
80
|
+
{ \
|
81
|
+
log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
|
82
|
+
} \
|
83
|
+
} while (0)
|
84
|
+
#endif
|
85
|
+
|
86
|
+
#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
87
|
+
#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
88
|
+
#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
|
89
|
+
|
90
|
+
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
91
|
+
{
|
92
|
+
size_t i;
|
93
|
+
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
94
|
+
{
|
95
|
+
}
|
96
|
+
return i;
|
97
|
+
}
|
98
|
+
|
99
|
+
static bool ends_with(const std::string &str, const std::string &suffix)
|
100
|
+
{
|
101
|
+
return str.size() >= suffix.size() &&
|
102
|
+
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
103
|
+
}
|
104
|
+
|
105
|
+
static size_t find_partial_stop_string(const std::string &stop,
|
106
|
+
const std::string &text)
|
107
|
+
{
|
108
|
+
if (!text.empty() && !stop.empty())
|
109
|
+
{
|
110
|
+
const char text_last_char = text.back();
|
111
|
+
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
112
|
+
{
|
113
|
+
if (stop[char_index] == text_last_char)
|
114
|
+
{
|
115
|
+
const std::string current_partial = stop.substr(0, char_index + 1);
|
116
|
+
if (ends_with(text, current_partial))
|
117
|
+
{
|
118
|
+
return text.size() - char_index - 1;
|
119
|
+
}
|
120
|
+
}
|
121
|
+
}
|
122
|
+
}
|
123
|
+
return std::string::npos;
|
124
|
+
}
|
125
|
+
|
126
|
+
// format incomplete utf-8 multibyte character for output
|
127
|
+
std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
128
|
+
{
|
129
|
+
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
|
130
|
+
// if the size is 1 and first bit is 1, meaning it's a partial character
|
131
|
+
// (size > 1 meaning it's already a known token)
|
132
|
+
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
133
|
+
{
|
134
|
+
std::stringstream ss;
|
135
|
+
ss << std::hex << (out[0] & 0xff);
|
136
|
+
std::string res(ss.str());
|
137
|
+
out = "byte: \\x" + res;
|
138
|
+
}
|
139
|
+
return out;
|
140
|
+
}
|
141
|
+
|
142
|
+
std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end)
|
143
|
+
{
|
144
|
+
std::string ret;
|
145
|
+
for (auto it = begin; it != end; ++it)
|
146
|
+
{
|
147
|
+
ret += common_token_to_piece(ctx, *it);
|
148
|
+
}
|
149
|
+
return ret;
|
150
|
+
}
|
151
|
+
|
152
|
+
llama_rn_context::~llama_rn_context() {
|
153
|
+
if (ctx_sampling != nullptr) {
|
154
|
+
common_sampler_free(ctx_sampling);
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
void llama_rn_context::rewind() {
|
159
|
+
is_interrupted = false;
|
160
|
+
params.antiprompt.clear();
|
161
|
+
params.sampling.grammar.clear();
|
162
|
+
num_prompt_tokens = 0;
|
163
|
+
num_tokens_predicted = 0;
|
164
|
+
generated_text = "";
|
165
|
+
generated_text.reserve(params.n_ctx);
|
166
|
+
generated_token_probs.clear();
|
167
|
+
truncated = false;
|
168
|
+
stopped_eos = false;
|
169
|
+
stopped_word = false;
|
170
|
+
stopped_limit = false;
|
171
|
+
stopping_word = "";
|
172
|
+
incomplete = false;
|
173
|
+
n_remain = 0;
|
174
|
+
n_past = 0;
|
175
|
+
params.sampling.n_prev = n_ctx;
|
176
|
+
}
|
177
|
+
|
178
|
+
bool llama_rn_context::initSampling() {
|
179
|
+
if (ctx_sampling != nullptr) {
|
180
|
+
common_sampler_free(ctx_sampling);
|
181
|
+
}
|
182
|
+
ctx_sampling = common_sampler_init(model, params.sampling);
|
183
|
+
return ctx_sampling != nullptr;
|
184
|
+
}
|
185
|
+
|
186
|
+
bool llama_rn_context::loadModel(common_params ¶ms_)
|
187
|
+
{
|
188
|
+
params = params_;
|
189
|
+
llama_init = common_init_from_params(params);
|
190
|
+
model = llama_init.model.get();
|
191
|
+
ctx = llama_init.context.get();
|
192
|
+
if (model == nullptr)
|
193
|
+
{
|
194
|
+
LOG_ERROR("unable to load model: %s", params_.model.path.c_str());
|
195
|
+
return false;
|
196
|
+
}
|
197
|
+
templates = common_chat_templates_init(model, params.chat_template);
|
198
|
+
n_ctx = llama_n_ctx(ctx);
|
199
|
+
|
200
|
+
// We can uncomment for debugging or after this fix: https://github.com/ggerganov/llama.cpp/pull/11101
|
201
|
+
// LOG_INFO("%s\n", common_params_get_system_info(params).c_str());
|
202
|
+
|
203
|
+
return true;
|
204
|
+
}
|
205
|
+
|
206
|
+
bool llama_rn_context::validateModelChatTemplate(bool use_jinja, const char *name) const {
|
207
|
+
const char * tmpl = llama_model_chat_template(model, name);
|
208
|
+
if (tmpl == nullptr) {
|
209
|
+
return false;
|
210
|
+
}
|
211
|
+
return common_chat_verify_template(tmpl, use_jinja);
|
212
|
+
}
|
213
|
+
|
214
|
+
common_chat_params llama_rn_context::getFormattedChatWithJinja(
|
215
|
+
const std::string &messages,
|
216
|
+
const std::string &chat_template,
|
217
|
+
const std::string &json_schema,
|
218
|
+
const std::string &tools,
|
219
|
+
const bool ¶llel_tool_calls,
|
220
|
+
const std::string &tool_choice
|
221
|
+
) const {
|
222
|
+
common_chat_templates_inputs inputs;
|
223
|
+
inputs.use_jinja = true;
|
224
|
+
inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
|
225
|
+
auto useTools = !tools.empty();
|
226
|
+
if (useTools) {
|
227
|
+
inputs.tools = common_chat_tools_parse_oaicompat(json::parse(tools));
|
228
|
+
}
|
229
|
+
inputs.parallel_tool_calls = parallel_tool_calls;
|
230
|
+
if (!tool_choice.empty()) {
|
231
|
+
inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(tool_choice);
|
232
|
+
}
|
233
|
+
if (!json_schema.empty()) {
|
234
|
+
inputs.json_schema = json::parse(json_schema);
|
235
|
+
}
|
236
|
+
inputs.extract_reasoning = params.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
237
|
+
|
238
|
+
// If chat_template is provided, create new one and use it (probably slow)
|
239
|
+
if (!chat_template.empty()) {
|
240
|
+
auto tmps = common_chat_templates_init(model, chat_template);
|
241
|
+
return common_chat_templates_apply(tmps.get(), inputs);
|
242
|
+
} else {
|
243
|
+
return common_chat_templates_apply(templates.get(), inputs);
|
244
|
+
}
|
245
|
+
}
|
246
|
+
|
247
|
+
std::string llama_rn_context::getFormattedChat(
|
248
|
+
const std::string &messages,
|
249
|
+
const std::string &chat_template
|
250
|
+
) const {
|
251
|
+
common_chat_templates_inputs inputs;
|
252
|
+
inputs.messages = common_chat_msgs_parse_oaicompat(json::parse(messages));
|
253
|
+
inputs.use_jinja = false;
|
254
|
+
|
255
|
+
// If chat_template is provided, create new one and use it (probably slow)
|
256
|
+
if (!chat_template.empty()) {
|
257
|
+
auto tmps = common_chat_templates_init(model, chat_template);
|
258
|
+
return common_chat_templates_apply(tmps.get(), inputs).prompt;
|
259
|
+
} else {
|
260
|
+
return common_chat_templates_apply(templates.get(), inputs).prompt;
|
261
|
+
}
|
262
|
+
}
|
263
|
+
|
264
|
+
void llama_rn_context::truncatePrompt(std::vector<llama_token> &prompt_tokens) {
|
265
|
+
const int n_left = n_ctx - params.n_keep;
|
266
|
+
const int n_block_size = n_left / 2;
|
267
|
+
const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_block_size) / n_block_size;
|
268
|
+
|
269
|
+
// Keep n_keep tokens at start of prompt (at most n_ctx - 4)
|
270
|
+
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
271
|
+
|
272
|
+
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
273
|
+
|
274
|
+
LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d, new_tokens: %s, num_prompt_tokens: %d",
|
275
|
+
n_ctx,
|
276
|
+
params.n_keep,
|
277
|
+
n_left,
|
278
|
+
tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend()).c_str(),
|
279
|
+
new_tokens.size()
|
280
|
+
);
|
281
|
+
|
282
|
+
truncated = true;
|
283
|
+
prompt_tokens = new_tokens;
|
284
|
+
}
|
285
|
+
|
286
|
+
void llama_rn_context::loadPrompt() {
|
287
|
+
std::vector<llama_token> prompt_tokens = ::common_tokenize(ctx, params.prompt, true, true);
|
288
|
+
num_prompt_tokens = prompt_tokens.size();
|
289
|
+
|
290
|
+
// LOG tokens
|
291
|
+
std::stringstream ss;
|
292
|
+
ss << "\n" << __func__ << ": prompt_tokens = ";
|
293
|
+
for (auto& token : prompt_tokens) {
|
294
|
+
ss << token << " ";
|
295
|
+
}
|
296
|
+
LOG_INFO("%s\n", ss.str().c_str());
|
297
|
+
|
298
|
+
if (params.n_keep < 0)
|
299
|
+
{
|
300
|
+
params.n_keep = (int)num_prompt_tokens;
|
301
|
+
}
|
302
|
+
params.n_keep = std::min(n_ctx - 4, params.n_keep);
|
303
|
+
|
304
|
+
// if input prompt is too big, truncate like normal
|
305
|
+
if (num_prompt_tokens >= (size_t) n_ctx)
|
306
|
+
{
|
307
|
+
truncatePrompt(prompt_tokens);
|
308
|
+
num_prompt_tokens = prompt_tokens.size();
|
309
|
+
|
310
|
+
LM_GGML_ASSERT(num_prompt_tokens < (size_t) n_ctx);
|
311
|
+
}
|
312
|
+
|
313
|
+
// do context shifitng
|
314
|
+
if(!params.embedding){
|
315
|
+
purge_missing_tokens(ctx, embd, prompt_tokens, params.n_predict, params.n_ctx);
|
316
|
+
}
|
317
|
+
|
318
|
+
|
319
|
+
// push the prompt into the sampling context (do not apply grammar)
|
320
|
+
for (auto & token : prompt_tokens)
|
321
|
+
{
|
322
|
+
common_sampler_accept(ctx_sampling, token, false);
|
323
|
+
}
|
324
|
+
|
325
|
+
// compare the evaluated prompt with the new prompt
|
326
|
+
n_past = common_part(embd, prompt_tokens);
|
327
|
+
|
328
|
+
embd = prompt_tokens;
|
329
|
+
if (n_past == num_prompt_tokens)
|
330
|
+
{
|
331
|
+
// we have to evaluate at least 1 token to generate logits.
|
332
|
+
n_past--;
|
333
|
+
}
|
334
|
+
|
335
|
+
// since #3228 we now have to manually manage the KV cache
|
336
|
+
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
|
337
|
+
|
338
|
+
LOG_VERBOSE("prompt ingested, n_past: %d, cached: %s, to_eval: %s",
|
339
|
+
n_past,
|
340
|
+
tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past).c_str(),
|
341
|
+
tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
|
342
|
+
);
|
343
|
+
|
344
|
+
has_next_token = true;
|
345
|
+
}
|
346
|
+
|
347
|
+
void llama_rn_context::beginCompletion() {
|
348
|
+
// number of tokens to keep when resetting context
|
349
|
+
n_remain = params.n_predict;
|
350
|
+
llama_perf_context_reset(ctx);
|
351
|
+
is_predicting = true;
|
352
|
+
}
|
353
|
+
|
354
|
+
completion_token_output llama_rn_context::nextToken()
{
    completion_token_output result;
    result.tok = -1;

    if (embd.size() >= (size_t)params.n_ctx)
    {
        // Shift context

        const int n_left    = n_past - params.n_keep - 1;
        const int n_discard = n_left/2;

        llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
        llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

        for (size_t i = params.n_keep + 1 + n_discard; i < embd.size(); i++)
        {
            embd[i - n_discard] = embd[i];
        }
        embd.resize(embd.size() - n_discard);

        n_past -= n_discard;

        LOG_VERBOSE("input truncated, n_ctx: %d, n_keep: %d, n_left: %d",
            params.n_ctx,
            params.n_keep,
            n_left
        );
    }

    bool tg = true;
    while (n_past < embd.size())
    {
        int n_eval = (int)embd.size() - n_past;
        tg = n_eval == 1;
        if (n_eval > params.n_batch)
        {
            n_eval = params.n_batch;
        }
        if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval)))
        {
            LOG_ERROR("failed to eval, n_eval: %d, n_past: %d, n_threads: %d, embd: %s",
                n_eval,
                n_past,
                params.cpuparams.n_threads,
                tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend()).c_str()
            );
            has_next_token = false;
            return result;
        }
        n_past += n_eval;

        if (is_interrupted) {
            LOG_INFO("Decoding Interrupted");
            embd.resize(n_past);
            has_next_token = false;
            return result;
        }
    }

    const llama_vocab* vocab = llama_model_get_vocab(model);

    if (params.n_predict == 0)
    {
        has_next_token = false;
        result.tok = llama_vocab_eos(vocab);
        return result;
    }

    {
        // out of user input, sample next token
        std::vector<llama_token_data> candidates;
        candidates.reserve(llama_vocab_n_tokens(vocab));

        result.tok = common_sampler_sample(ctx_sampling, ctx, -1);

        llama_token_data_array cur_p = *common_sampler_get_candidates(ctx_sampling);

        const int32_t n_probs = params.sampling.n_probs;

        // deprecated
        /*if (params.sampling.temp <= 0 && n_probs > 0)
        {
            // For llama_sample_token_greedy we need to sort candidates
            llama_sampler_init_softmax();
        }*/

        for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
        {
            result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
        }

        common_sampler_accept(ctx_sampling, result.tok, true);
        if (tg) {
            num_tokens_predicted++;
        }
    }

    // add it to the context
    embd.push_back(result.tok);
    // decrement remaining sampling budget
    --n_remain;

    if (!embd.empty() && embd.back() == llama_vocab_eos(vocab))
    {
        // stopping_word = llama_token_to_piece(ctx, embd.back());
        has_next_token = false;
        stopped_eos = true;
        LOG_VERBOSE("eos token found", "");
        return result;
    }

    has_next_token = params.n_predict == -1 || n_remain != 0;
    return result;
}

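// Illustrative sketch (not from the original file): the context shift at the
// top of nextToken() drops the oldest half of the overflow while pinning the
// first n_keep + 1 tokens; the KV cache is updated with llama_kv_self_seq_rm /
// llama_kv_self_seq_add and the token vector is compacted to match. The vector
// side of that compaction is equivalent to a single erase:
static void shift_tokens_sketch(std::vector<llama_token> & embd, int n_keep, int n_discard) {
    // remove the discarded window that sits right after the pinned prefix
    embd.erase(embd.begin() + n_keep + 1,
               embd.begin() + n_keep + 1 + n_discard);
}
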
size_t llama_rn_context::findStoppingStrings(const std::string &text, const size_t last_token_size,
                                             const stop_type type)
{
    size_t stop_pos = std::string::npos;
    for (const std::string &word : params.antiprompt)
    {
        size_t pos;
        if (type == STOP_FULL)
        {
            const size_t tmp      = word.size() + last_token_size;
            const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0;
            pos = text.find(word, from_pos);
        }
        else
        {
            pos = find_partial_stop_string(word, text);
        }
        if (pos != std::string::npos &&
            (stop_pos == std::string::npos || pos < stop_pos))
        {
            if (type == STOP_FULL)
            {
                stopping_word  = word;
                stopped_word   = true;
                has_next_token = false;
            }
            stop_pos = pos;
        }
    }
    return stop_pos;
}

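// Illustrative sketch (not from the original file): the STOP_PARTIAL branch
// via find_partial_stop_string() is what lets streaming output hold back text
// that *might* become a stop word. Assuming it matches a suffix of the
// generated text against a prefix of the stop word, a minimal equivalent is:
static size_t partial_stop_sketch(const std::string & stop, const std::string & text) {
    if (!text.empty() && !stop.empty()) {
        for (size_t n = std::min(stop.size(), text.size()); n > 0; --n) {
            // do the last n chars of text equal the first n chars of stop?
            if (text.compare(text.size() - n, n, stop, 0, n) == 0) {
                return text.size() - n;
            }
        }
    }
    return std::string::npos;
}
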
completion_token_output llama_rn_context::doCompletion()
{
    const completion_token_output token_with_probs = nextToken();

    const std::string token_text = token_with_probs.tok == -1 ? "" : common_token_to_piece(ctx, token_with_probs.tok);
    generated_text += token_text;

    if (params.sampling.n_probs > 0)
    {
        generated_token_probs.push_back(token_with_probs);
    }

    // check if there is incomplete UTF-8 character at the end
    for (unsigned i = 1; i < 5 && i <= generated_text.size(); ++i) {
        unsigned char c = generated_text[generated_text.size() - i];
        if ((c & 0xC0) == 0x80) {
            // continuation byte: 10xxxxxx
            continue;
        }
        if ((c & 0xE0) == 0xC0) {
            // 2-byte character: 110xxxxx ...
            incomplete = i < 2;
        } else if ((c & 0xF0) == 0xE0) {
            // 3-byte character: 1110xxxx ...
            incomplete = i < 3;
        } else if ((c & 0xF8) == 0xF0) {
            // 4-byte character: 11110xxx ...
            incomplete = i < 4;
        }
        // else 1-byte character or invalid byte
        break;
    }

    if (incomplete && !has_next_token)
    {
        has_next_token = true;
        n_remain++;
    }

    if (!has_next_token && n_remain == 0)
    {
        stopped_limit = true;
    }

    LOG_VERBOSE("next token, token: %s, token_text: %s, has_next_token: %d, n_remain: %d, num_tokens_predicted: %d, stopped_eos: %d, stopped_word: %d, stopped_limit: %d, stopping_word: %s",
        common_token_to_piece(ctx, token_with_probs.tok).c_str(),
        tokens_to_output_formatted_string(ctx, token_with_probs.tok).c_str(),
        has_next_token,
        n_remain,
        num_tokens_predicted,
        stopped_eos,
        stopped_word,
        stopped_limit,
        stopping_word.c_str()
    );
    return token_with_probs;
}

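// Worked example (illustrative, not from the original file): "€" encodes as
// E2 82 AC. If generation pauses after emitting only E2 82, the scan above
// walks back over the continuation byte 82, hits the 3-byte lead E2 at i == 2
// and sets incomplete = true (i < 3), so doCompletion() requests one more
// token instead of emitting a broken character:
static bool is_incomplete_utf8_sketch(const std::string & s) {
    for (unsigned i = 1; i < 5 && i <= s.size(); ++i) {
        const unsigned char c = s[s.size() - i];
        if ((c & 0xC0) == 0x80) continue;     // continuation byte: keep walking back
        if ((c & 0xE0) == 0xC0) return i < 2; // 2-byte lead: needs 1 continuation
        if ((c & 0xF0) == 0xE0) return i < 3; // 3-byte lead: needs 2
        if ((c & 0xF8) == 0xF0) return i < 4; // 4-byte lead: needs 3
        return false;                         // ASCII or invalid byte
    }
    return false;
}
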
std::vector<float> llama_rn_context::getEmbedding(common_params &embd_params)
{
    static const int n_embd = llama_model_n_embd(llama_get_model(ctx));
    if (!embd_params.embedding)
    {
        LOG_WARNING("embedding disabled, embedding: %s", embd_params.embedding ? "true" : "false");
        return std::vector<float>(n_embd, 0.0f);
    }
    float *data;

    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
        data = llama_get_embeddings(ctx);
    } else {
        data = llama_get_embeddings_seq(ctx, 0);
    }

    if (!data) {
        return std::vector<float>(n_embd, 0.0f);
    }
    std::vector<float> embedding(data, data + n_embd), out(data, data + n_embd);
    common_embd_normalize(embedding.data(), out.data(), n_embd, embd_params.embd_normalize);
    return out;
}

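// Illustrative sketch (not from the original file): assuming
// common_embd_normalize() with embd_normalize == 2 performs plain L2
// normalization (other values select other norms), the core operation is:
static void l2_normalize_sketch(const float * inp, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += (double) inp[i] * inp[i];
    }
    const float scale = sum > 0.0 ? (float) (1.0 / sqrt(sum)) : 0.0f;
    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * scale; // unit-length embedding
    }
}
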
std::string llama_rn_context::bench(int pp, int tg, int pl, int nr)
{
    if (is_predicting) {
        LOG_ERROR("cannot benchmark while predicting", "");
        return std::string("[]");
    }

    is_predicting = true;

    double pp_avg = 0;
    double tg_avg = 0;

    double pp_std = 0;
    double tg_std = 0;

    // TODO: move batch into llama_rn_context (related https://github.com/mybigday/llama.rn/issues/30)
    llama_batch batch = llama_batch_init(
        std::min(pp, params.n_ubatch), // max n_tokens is limited by n_ubatch
        0,                             // No embeddings
        1                              // Single sequence
    );

    for (int i = 0; i < nr; i++)
    {
        llama_batch_clear(&batch);

        const int n_tokens = pp;

        for (int i = 0; i < n_tokens; i++)
        {
            llama_batch_add(&batch, 0, i, {0}, false);
        }
        batch.logits[batch.n_tokens - 1] = 1; // true

        llama_kv_self_clear(ctx);

        const int64_t t_pp_start = llama_time_us();
        if (llama_decode(ctx, batch) != 0)
        {
            LOG_ERROR("llama_decode() failed during prompt", "");
        }
        const int64_t t_pp_end = llama_time_us();
        llama_kv_self_clear(ctx);

        if (is_interrupted) break;

        const int64_t t_tg_start = llama_time_us();

        for (int i = 0; i < tg; i++)
        {
            llama_batch_clear(&batch);

            for (int j = 0; j < pl; j++)
            {
                llama_batch_add(&batch, 0, i, {j}, true);
            }

            if (llama_decode(ctx, batch) != 0)
            {
                LOG_ERROR("llama_decode() failed during text generation", "");
            }
            if (is_interrupted) break;
        }

        const int64_t t_tg_end = llama_time_us();

        llama_kv_self_clear(ctx);

        const double t_pp = (t_pp_end - t_pp_start) / 1000000.0;
        const double t_tg = (t_tg_end - t_tg_start) / 1000000.0;

        const double speed_pp = pp / t_pp;
        const double speed_tg = (pl * tg) / t_tg;

        pp_avg += speed_pp;
        tg_avg += speed_tg;

        pp_std += speed_pp * speed_pp;
        tg_std += speed_tg * speed_tg;
    }

    pp_avg /= nr;
    tg_avg /= nr;

    if (nr > 1) {
        pp_std = sqrt(pp_std / (nr - 1) - pp_avg * pp_avg * nr / (nr - 1));
        tg_std = sqrt(tg_std / (nr - 1) - tg_avg * tg_avg * nr / (nr - 1));
    } else {
        pp_std = 0;
        tg_std = 0;
    }

    llama_batch_free(batch); // release the scratch batch

    if (is_interrupted) llama_kv_self_clear(ctx);
    is_predicting = false;

    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    return std::string("[\"") + model_desc + std::string("\",") +
        std::to_string(llama_model_size(model)) + std::string(",") +
        std::to_string(llama_model_n_params(model)) + std::string(",") +
        std::to_string(pp_avg) + std::string(",") +
        std::to_string(pp_std) + std::string(",") +
        std::to_string(tg_avg) + std::string(",") +
        std::to_string(tg_std) +
        std::string("]");
}

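// Usage sketch (illustrative, not from the original file): benchmark a
// 512-token prompt and 128 generated tokens at parallel level 1, repeated 3
// times. The returned string is a JSON array of the form
// ["<model_desc>", <size_bytes>, <n_params>, <pp_avg>, <pp_std>, <tg_avg>, <tg_std>]:
//
//   std::string result = rn_ctx->bench(512, 128, 1, 3);
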
int llama_rn_context::applyLoraAdapters(std::vector<common_adapter_lora_info> lora) {
    for (auto &la : lora) {
        la.ptr = llama_adapter_lora_init(model, la.path.c_str());
        if (la.ptr == nullptr) {
            LOG_ERROR("failed to apply lora adapter '%s'\n", la.path.c_str());
            return -1;
        }
    }
    this->lora = lora;
    common_set_adapter_lora(ctx, lora);
    return 0;
}

void llama_rn_context::removeLoraAdapters() {
    this->lora.clear();
    common_set_adapter_lora(ctx, this->lora); // apply empty list
}

std::vector<common_adapter_lora_info> llama_rn_context::getLoadedLoraAdapters() {
    return this->lora;
}

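// Usage sketch (illustrative, not from the original file; the adapter path is
// hypothetical and the `scale` field of common_adapter_lora_info is assumed
// from common.h):
//
//   std::vector<common_adapter_lora_info> adapters;
//   common_adapter_lora_info la;
//   la.path  = "/path/to/adapter.gguf"; // hypothetical path
//   la.scale = 1.0f;
//   adapters.push_back(la);
//   if (rn_ctx->applyLoraAdapters(adapters) != 0) {
//       // init failed, applyLoraAdapters returned before activating anything
//   }
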
// Despite the name, this returns the longest common *contiguous* run
// (longest common substring) of token ids, not a general subsequence:
// the suffix table resets to 0 on any mismatch.
std::vector<int> llama_rn_context::longest_common_subseq(const std::vector<int> x, const std::vector<int> y) {
    int m = x.size(), n = y.size();

    //int LCSuff[m+1][n+1];
    std::vector<std::vector<int>> LCSuff(m+1, std::vector<int>(n+1));

    for (int j = 0; j <= n; j++)
        LCSuff[0][j] = 0;
    for (int i = 0; i <= m; i++)
        LCSuff[i][0] = 0;

    for (int i = 1; i <= m; i++)
    {
        for (int j = 1; j <= n; j++)
        {
            if (x[i - 1] == y[j - 1])
                LCSuff[i][j] = LCSuff[i - 1][j - 1] + 1;
            else
                LCSuff[i][j] = 0;
        }
    }

    std::vector<int> longest;
    for (int i = 1; i <= m; i++)
    {
        for (int j = 1; j <= n; j++)
        {
            if (LCSuff[i][j] > (int) longest.size())
            {
                auto off1 = ((i - LCSuff[i][j] + 1) - 1);
                auto off2 = off1 + LCSuff[i][j];
                longest.clear();
                // std::vector<int>().swap(longest);
                longest = std::vector<int>(x.begin() + off1, x.begin() + off2);
                // x.substr((i - LCSuff[i][j] + 1) - 1, LCSuff[i][j]);
            }
        }
    }
    return longest;
}

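// Worked example (illustrative, not from the original file): for
// x = {1, 2, 3, 4, 5} and y = {9, 3, 4, 5, 6} the scan above returns
// {3, 4, 5} — the longest common contiguous run, which is exactly what the
// KV-reuse logic in purge_missing_tokens() needs. Assumes <cassert>:
static void lcs_example_sketch(llama_rn_context & c) {
    const std::vector<int> x = {1, 2, 3, 4, 5};
    const std::vector<int> y = {9, 3, 4, 5, 6};
    const std::vector<int> expected = {3, 4, 5};
    assert(c.longest_common_subseq(x, y) == expected);
}
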
bool llama_rn_context::arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq)
{
    int ss = searchSeq.size();
    if ((int) targetArray.size() < ss)
    {
        return false;
    }
    for (int i = 0; i < ss; ++i)
    {
        if (targetArray[i] != searchSeq[i])
        {
            return false;
        }
    }
    return true;
}

int llama_rn_context::arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq)
{
    int ss  = searchSeq.size();
    int tas = targetArray.size();
    if (tas < ss)
    {
        return -1;
    }
    for (int i = 0; i < tas; ++i)
    {
        bool fail = false;
        for (int srch = 0; srch < ss; ++srch)
        {
            if ((i + srch) >= tas || targetArray[i + srch] != searchSeq[srch])
            {
                fail = true;
                break;
            }
        }
        if (!fail)
        {
            return i;
        }
    }
    return -1;
}

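// Quick check (illustrative, not from the original file): arr_find_index_of()
// returns the first index where the whole needle matches, and arr_start_with()
// is the index-0 special case. Assumes <cassert>:
static void arr_helpers_sketch(llama_rn_context & c) {
    const std::vector<int> hay    = {7, 8, 1, 2, 3, 9};
    const std::vector<int> needle = {1, 2, 3};
    assert(c.arr_find_index_of(hay, needle) == 2); // match starts at index 2
    assert(!c.arr_start_with(hay, needle));        // hay does not begin with needle
}
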
void llama_rn_context::purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx)
{
    //scan from start of old and new ctx until the first mismatch is found, save as p0
    //check remaining old and new ctx for the longest common subseq, which needs to be at least 256 tokens
    //test: longest common subseq (LCQ) MUST start within 0 tokens from end of memory, otherwise purge fails
    //if passed, save beginning of LCQ from old ctx as p1
    //remove all tokens from old ctx between p0 and p1, updating both arrays and kv, then continue as normal

    const int short_fall_threshold = 200 + (nctx/30); //don't trigger shifting if the distance between trimstart and currhead < this
    const int stack_allowance = 60 + (nctx/50); //in case the end text is slightly modified, be forgiving

    int trimstart = 0;
    int new_tokens_len = new_context_tokens.size();
    bool purge_needed = true;

    // iterate only over the shared range so neither vector is read out of bounds
    const int shared_len = (int) std::min(current_context_tokens.size(), new_context_tokens.size());
    for (int i = 0; i < shared_len; ++i)
    {
        if (current_context_tokens[i] == new_context_tokens[i])
        {
            trimstart += 1;
        }
        else
        {
            break;
        }
        if ((i + 2) >= new_tokens_len)
        {
            purge_needed = false;
            break; //no surgery required
        }
    }

    if (!purge_needed || new_tokens_len < 6 || current_context_tokens.size() < 6 || new_tokens_len - trimstart < short_fall_threshold)
    {
        LOG_INFO("Fall Threshold: %d out of %d\n", new_tokens_len - trimstart, short_fall_threshold);
        return; //no purge is needed
    }

    //at least this many tokens need to match, otherwise don't bother trimming
    const int lc_tok_threshold = std::max(std::min((new_tokens_len - trimstart) - (genamt+stack_allowance), (int)(nctx*0.45)), short_fall_threshold - stack_allowance);

    auto curr_ctx_without_memory = std::vector<int>(current_context_tokens.begin() + trimstart, current_context_tokens.end());
    auto new_ctx_without_memory = std::vector<int>(new_context_tokens.begin() + trimstart, new_context_tokens.end());

    auto shared = longest_common_subseq(curr_ctx_without_memory, new_ctx_without_memory);

    if ((int) shared.size() > lc_tok_threshold && arr_start_with(new_ctx_without_memory, shared)) // enough tokens in common
    {
        int found = arr_find_index_of(current_context_tokens, shared);
        if (found >= 0 && found > trimstart)
        {
            //extract the unwanted tokens out from context and KV
            int diff = found - trimstart;
            llama_kv_self_seq_rm(ctx, 0, trimstart, trimstart + diff);
            llama_kv_self_seq_add(ctx, 0, trimstart + diff, -1, -diff);

            for (size_t i = trimstart + diff; i < current_context_tokens.size() - 1; i++)
            {
                current_context_tokens[i - diff] = current_context_tokens[i];
            }

            LOG_INFO("\n[Context Shifting: Erased %d tokens at position %d]", diff, trimstart + 1);

            current_context_tokens.resize(current_context_tokens.size() - diff);
        }
    }

}

}
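// Flow sketch (illustrative, not from the original file): when a chat client
// re-sends its transcript with the oldest turns evicted, the shared prefix
// ("memory") still matches, a middle chunk disappears, and the tail matches
// again. purge_missing_tokens() detects the matching tail as the longest
// common run, removes only the evicted middle from the KV cache with
// llama_kv_self_seq_rm()/llama_kv_self_seq_add(), and loadPrompt() then reuses
// everything up to the first real difference via common_part(), so only the
// genuinely new tokens are re-evaluated.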