cui-llama.rn 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -7
- package/android/src/main/CMakeLists.txt +22 -11
- package/android/src/main/java/com/rnllama/LlamaContext.java +42 -6
- package/android/src/main/java/com/rnllama/RNLlama.java +139 -4
- package/android/src/main/jni.cpp +173 -18
- package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
- package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +24 -4
- package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +22 -2
- package/cpp/LICENSE +21 -0
- package/cpp/chat.cpp +129 -107
- package/cpp/chat.h +2 -0
- package/cpp/common.cpp +58 -78
- package/cpp/common.h +29 -21
- package/cpp/ggml-alloc.c +4 -1
- package/cpp/ggml-backend.cpp +9 -5
- package/cpp/ggml-backend.h +4 -4
- package/cpp/ggml-cpp.h +1 -1
- package/cpp/ggml-cpu/amx/amx.cpp +221 -0
- package/cpp/ggml-cpu/amx/amx.h +8 -0
- package/cpp/ggml-cpu/amx/common.h +91 -0
- package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
- package/cpp/ggml-cpu/amx/mmq.h +10 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
- package/cpp/ggml-cpu/common.h +72 -0
- package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -103
- package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +306 -6
- package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +114 -55
- package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +32 -16
- package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +353 -173
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
- package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
- package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
- package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
- package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -6
- package/{ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/vec.h +16 -0
- package/cpp/ggml-cpu.h +5 -0
- package/cpp/ggml-impl.h +16 -9
- package/cpp/ggml-llama-sim.metallib +0 -0
- package/cpp/ggml-llama.metallib +0 -0
- package/cpp/ggml-metal-impl.h +36 -11
- package/cpp/ggml-metal.m +810 -176
- package/cpp/ggml-opt.cpp +373 -190
- package/cpp/ggml-opt.h +49 -28
- package/cpp/ggml-quants.c +0 -6
- package/cpp/ggml.c +227 -282
- package/cpp/ggml.h +82 -101
- package/cpp/gguf.cpp +33 -33
- package/cpp/json-schema-to-grammar.cpp +3 -0
- package/cpp/llama-adapter.cpp +6 -0
- package/cpp/llama-arch.cpp +49 -17
- package/cpp/llama-arch.h +9 -0
- package/cpp/llama-batch.cpp +8 -2
- package/cpp/llama-batch.h +2 -1
- package/cpp/llama-chat.cpp +39 -16
- package/cpp/llama-chat.h +4 -2
- package/cpp/llama-context.cpp +440 -611
- package/cpp/llama-context.h +44 -33
- package/cpp/llama-cparams.h +1 -0
- package/cpp/llama-graph.cpp +214 -291
- package/cpp/llama-graph.h +69 -21
- package/cpp/llama-hparams.cpp +17 -1
- package/cpp/llama-hparams.h +39 -5
- package/cpp/llama-kv-cache.cpp +2067 -620
- package/cpp/llama-kv-cache.h +410 -108
- package/cpp/llama-memory.h +12 -1
- package/cpp/llama-model-loader.cpp +24 -15
- package/cpp/llama-model-saver.cpp +281 -0
- package/cpp/llama-model-saver.h +37 -0
- package/cpp/llama-model.cpp +1089 -359
- package/cpp/llama-model.h +19 -3
- package/cpp/llama-sampling.cpp +20 -7
- package/cpp/llama-vocab.cpp +54 -9
- package/cpp/llama-vocab.h +6 -0
- package/cpp/llama.cpp +14 -0
- package/cpp/llama.h +86 -142
- package/cpp/minja/chat-template.hpp +9 -5
- package/cpp/minja/minja.hpp +69 -36
- package/cpp/rn-llama.cpp +602 -190
- package/cpp/rn-llama.h +34 -8
- package/cpp/sampling.cpp +57 -50
- package/cpp/tools/mtmd/clip-impl.h +462 -0
- package/cpp/tools/mtmd/clip.cpp +4024 -0
- package/cpp/tools/mtmd/clip.h +101 -0
- package/cpp/tools/mtmd/miniaudio.h +93468 -0
- package/cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/cpp/tools/mtmd/mtmd-helper.cpp +297 -0
- package/cpp/tools/mtmd/mtmd.cpp +942 -0
- package/cpp/tools/mtmd/mtmd.h +362 -0
- package/cpp/tools/mtmd/stb_image.h +7988 -0
- package/ios/CMakeLists.txt +20 -10
- package/ios/RNLlama.h +6 -0
- package/ios/RNLlama.mm +82 -3
- package/ios/RNLlamaContext.h +5 -1
- package/ios/RNLlamaContext.mm +131 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +2 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +29 -21
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-backend.h +4 -4
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-metal-impl.h +36 -11
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-opt.h +49 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +82 -101
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +4 -2
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +44 -33
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +1 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +69 -21
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +39 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +410 -108
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +12 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model-saver.h +37 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +19 -3
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +6 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +86 -142
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +9 -5
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +69 -36
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +34 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Info.plist +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/_CodeSignature/CodeResources +1 -1
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
- package/jest/mock.js +33 -7
- package/lib/commonjs/NativeRNLlama.js.map +1 -1
- package/lib/commonjs/index.js +153 -21
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNLlama.js.map +1 -1
- package/lib/module/index.js +152 -20
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +54 -4
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +72 -6
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +72 -4
- package/src/index.ts +212 -38
- package/cpp/binary-ops.h +0 -16
- package/cpp/ops.h +0 -128
- package/cpp/simd-mappings.h +0 -888
- package/cpp/unary-ops.h +0 -28
- package/cpp/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
- package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
- package/lib/commonjs/chat.js +0 -37
- package/lib/commonjs/chat.js.map +0 -1
- package/lib/module/chat.js +0 -33
- package/lib/module/chat.js.map +0 -1
- package/lib/typescript/chat.d.ts +0 -10
- package/lib/typescript/chat.d.ts.map +0 -1
- package/src/chat.ts +0 -44
- /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
- /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
- /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
- /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
- /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
- /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
- /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
- /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
package/cpp/llama-arch.cpp
CHANGED
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,           "refact"         },
     { LLM_ARCH_BERT,             "bert"           },
     { LLM_ARCH_NOMIC_BERT,       "nomic-bert"     },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"   },
     { LLM_ARCH_BLOOM,            "bloom"          },
     { LLM_ARCH_STABLELM,         "stablelm"       },
@@ -54,6 +55,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"       },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"      },
     { LLM_ARCH_CHATGLM,          "chatglm"        },
+    { LLM_ARCH_GLM4,             "glm4"           },
     { LLM_ARCH_BITNET,           "bitnet"         },
     { LLM_ARCH_T5,               "t5"             },
     { LLM_ARCH_T5ENCODER,        "t5encoder"      },
@@ -105,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,    "%s.expert_weights_scale"    },
     { LLM_KV_EXPERT_WEIGHTS_NORM,     "%s.expert_weights_norm"     },
     { LLM_KV_EXPERT_GATING_FUNC,      "%s.expert_gating_func"      },
+    { LLM_KV_MOE_EVERY_N_LAYERS,      "%s.moe_every_n_layers"      },
     { LLM_KV_POOLING_TYPE,            "%s.pooling_type"            },
     { LLM_KV_LOGIT_SCALE,             "%s.logit_scale"             },
     { LLM_KV_DECODER_START_TOKEN_ID,  "%s.decoder_start_token_id"  },
@@ -139,6 +142,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW,         "%s.attention.sliding_window"         },
     { LLM_KV_ATTENTION_SCALE,                  "%s.attention.scale"                  },
+    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,         "%s.attention.key_length_mla"         },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,       "%s.attention.value_length_mla"       },

     { LLM_KV_ROPE_DIMENSION_COUNT,             "%s.rope.dimension_count"             },
     { LLM_KV_ROPE_DIMENSION_SECTIONS,          "%s.rope.dimension_sections"          },
@@ -469,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1102,6 +1125,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_Q_B,        "blk.%d.attn_q_b" },
             { LLM_TENSOR_ATTN_KV_A_MQA,   "blk.%d.attn_kv_a_mqa" },
             { LLM_TENSOR_ATTN_KV_B,       "blk.%d.attn_kv_b" },
+            { LLM_TENSOR_ATTN_K_B,        "blk.%d.attn_k_b" },
+            { LLM_TENSOR_ATTN_V_B,        "blk.%d.attn_v_b" },
             { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
@@ -1152,6 +1177,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_GLM4,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_ATTN_POST_NORM,  "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_POST_NORM,   "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_BITNET,
         {
@@ -1437,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
         },
     },
     {
@@ -1543,23 +1590,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {
-    {
-    {LLM_TENSOR_ATTN_Q,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_K,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_V,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_QKV,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_OUT,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP,         {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_FFN_UP_SHEXP,   {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_A,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_Q_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_A_MQA,  {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ATTN_KV_B,      {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_K_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_V_B,       {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_V,     {LLM_TENSOR_LAYER_REPEATING, LM_GGML_OP_MUL_MAT}},
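A note on the tables above: entries in LLM_TENSOR_NAMES are printf-style patterns, and per-layer tensors carry the block index through "blk.%d". A minimal sketch of the expansion, assuming snprintf-style formatting (the helper below is hypothetical, not part of this package):

#include <cstdio>
#include <string>

// Hypothetical helper: expand a per-layer pattern such as "blk.%d.attn_k_b"
// (one of the MLA tensors added above) into a concrete GGUF tensor name.
static std::string expand_tensor_name(const char * pattern, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, layer);
    return buf;
}

int main() {
    // Prints "blk.3.attn_k_b" -- the name that would be looked up in the model file.
    std::printf("%s\n", expand_tensor_name("blk.%d.attn_k_b", 3).c_str());
    return 0;
}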
package/cpp/llama-arch.h
CHANGED
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -58,6 +59,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
+    LLM_ARCH_GLM4,
     LLM_ARCH_BITNET,
     LLM_ARCH_T5,
     LLM_ARCH_T5ENCODER,
@@ -109,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -143,6 +146,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -256,6 +261,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
+    LLM_TENSOR_POST_ATTN_NORM,
+    LLM_TENSOR_POST_MLP_NORM,
     LLM_TENSOR_SSM_IN,
     LLM_TENSOR_SSM_CONV1D,
     LLM_TENSOR_SSM_X,
@@ -303,6 +310,8 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
+    LLM_TENSOR_ATTN_K_B,
+    LLM_TENSOR_ATTN_V_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
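These enum values pair with the "%s"-prefixed patterns in LLM_KV_NAMES, where "%s" stands in for the architecture name when GGUF metadata is read. A sketch of how one of the new MLA keys expands, assuming an MLA architecture such as deepseek2 (illustrative only, not the package's exact lookup path):

#include <cstdio>

int main() {
    // "%s.attention.key_length_mla" + arch name -> the GGUF metadata key
    // behind LLM_KV_ATTENTION_KEY_LENGTH_MLA above.
    char key[128];
    std::snprintf(key, sizeof(key), "%s.attention.key_length_mla", "deepseek2");
    std::printf("%s\n", key); // deepseek2.attention.key_length_mla
    return 0;
}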
package/cpp/llama-batch.cpp
CHANGED
@@ -1,5 +1,6 @@
 #include "llama-batch.h"

+#include <cassert>
 #include <cstring>
 #include <algorithm>

@@ -189,7 +190,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     LM_GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +204,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +214,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +242,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +266,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
@@ -277,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0) {
     batch = in_batch;
     LM_GGML_ASSERT(batch.n_tokens > 0);
     if (!batch.pos) {
+        assert(p0 >= 0);
         pos.resize(batch.n_tokens);
         for (int32_t i = 0; i < batch.n_tokens; i++) {
-            pos[i] =
+            pos[i] = p0 + i;
         }
         batch.pos = pos.data();
     }
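The llama_batch_allocr hunk above changes how positions are synthesized when a batch arrives without an explicit pos array: they now start at the caller-supplied p0, which is asserted non-negative. A standalone sketch of the resulting behavior, mirroring the loop in the diff:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t n_tokens = 4;
    const int32_t p0       = 10; // first position; the new code asserts p0 >= 0
    assert(p0 >= 0);

    // Mirrors the diff: when batch.pos is null, position i becomes p0 + i.
    std::vector<int32_t> pos(n_tokens);
    for (int32_t i = 0; i < n_tokens; i++) {
        pos[i] = p0 + i;
    }
    for (int32_t p : pos) {
        std::printf("%d ", p); // 10 11 12 13
    }
    std::printf("\n");
    return 0;
}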
package/cpp/llama-batch.h
CHANGED
@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
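This header change replaces the old two-step initialization (default-construct, then call the removed from_batch member) with a constructor, while the defaulted llama_sbatch() keeps the type default-constructible. A hedged usage sketch, assuming llama-batch.h is on the include path:

#include "llama-batch.h" // internal rn-llama header; assumed available here

void build_sbatch_example(const llama_batch & batch, size_t n_embd) {
    // 1.7.0 style: one-step construction; logits_all keeps its default (false).
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true);

    // Still possible to hold an empty instance, thanks to llama_sbatch() = default.
    llama_sbatch pending;
    (void) sbatch;
    (void) pending;
}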
package/cpp/llama-chat.cpp
CHANGED
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3 },
     { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
     { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
     { "phi3",              LLM_CHAT_TEMPLATE_PHI_3 },
     { "phi4",              LLM_CHAT_TEMPLATE_PHI_4 },
     { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -50,8 +51,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R },
     { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3 },
-    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3 },
-    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGLM_3 },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGLM_4 },
     { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE },
     { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM },
     { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3 },
@@ -62,6 +63,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "yandex",            LLM_CHAT_TEMPLATE_YANDEX },
     { "bailing",           LLM_CHAT_TEMPLATE_BAILING },
     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4 },
+    { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -81,7 +83,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -119,8 +123,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -149,9 +157,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -197,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@@ -432,7 +439,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -442,14 +449,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
@@ -620,7 +627,23 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|header_start|>assistant<|header_end|>\n\n";
         }
-    } else {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
+    } else {
         // template not supported
         return -1;
     }
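To make the new SmolVLM branch concrete, here is a self-contained sketch that applies the same formatting rules to a sample conversation; the chat contents are invented for illustration, but the tokens and layout follow the diff above (with add_ass == true):

#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (role, content) pairs standing in for llama_chat_message entries.
    const std::vector<std::pair<std::string, std::string>> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Describe the image."          },
    };

    std::ostringstream ss;
    ss << "<|im_start|>"; // BOS only; the body below is not chatml
    for (const auto & m : chat) {
        if (m.first == "system") {
            ss << m.second << "\n\n";
        } else if (m.first == "user") {
            ss << "User: " << m.second << "<end_of_utterance>\n";
        } else {
            ss << "Assistant: " << m.second << "<end_of_utterance>\n";
        }
    }
    ss << "Assistant:"; // add_ass == true

    std::cout << ss.str() << std::endl;
    return 0;
}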
package/cpp/llama-chat.h
CHANGED
@@ -14,6 +14,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+   LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_PHI_4,
    LLM_CHAT_TEMPLATE_FALCON_3,
@@ -29,8 +30,8 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
    LLM_CHAT_TEMPLATE_COMMAND_R,
    LLM_CHAT_TEMPLATE_LLAMA_3,
-   LLM_CHAT_TEMPLATE_CHATGML_3,
-   LLM_CHAT_TEMPLATE_CHATGML_4,
+   LLM_CHAT_TEMPLATE_CHATGLM_3,
+   LLM_CHAT_TEMPLATE_CHATGLM_4,
    LLM_CHAT_TEMPLATE_GLMEDGE,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -41,6 +42,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_YANDEX,
    LLM_CHAT_TEMPLATE_BAILING,
    LLM_CHAT_TEMPLATE_LLAMA4,
+   LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };
