@novastera-oss/llamarn 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/build.gradle +2 -1
- package/android/proguard-rules.pro +12 -0
- package/android/src/main/cpp/include/llama.h +15 -47
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakePresets.json +11 -0
- package/cpp/llama.cpp/CODEOWNERS +1 -0
- package/cpp/llama.cpp/README.md +4 -3
- package/cpp/llama.cpp/common/arg.cpp +45 -1
- package/cpp/llama.cpp/common/common.cpp +22 -6
- package/cpp/llama.cpp/common/common.h +18 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
- package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
- package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
- package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
- package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
- package/cpp/llama.cpp/include/llama.h +15 -7
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
- package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
- package/cpp/llama.cpp/src/llama-arch.h +5 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
- package/cpp/llama.cpp/src/llama-batch.h +24 -18
- package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
- package/cpp/llama.cpp/src/llama-chat.h +2 -0
- package/cpp/llama.cpp/src/llama-context.cpp +180 -106
- package/cpp/llama.cpp/src/llama-context.h +26 -16
- package/cpp/llama.cpp/src/llama-cparams.h +3 -2
- package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
- package/cpp/llama.cpp/src/llama-graph.h +147 -72
- package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
- package/cpp/llama.cpp/src/llama-hparams.h +10 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
- package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
- package/cpp/llama.cpp/src/llama-model.h +3 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
- package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
- package/cpp/llama.cpp/src/llama-vocab.h +2 -0
- package/cpp/llama.cpp/src/unicode.cpp +207 -0
- package/cpp/llama.cpp/src/unicode.h +2 -0
- package/ios/include/common.h +18 -4
- package/ios/include/llama.h +15 -7
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +4 -4
package/android/build.gradle
CHANGED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# ProGuard rules for @novastera-oss/llamarn library
|
|
2
|
+
# These rules will be automatically included when apps use this library
|
|
3
|
+
|
|
4
|
+
# Keep all classes in our package (includes NativeRNLlamaCppSpec, RNLlamaCppPackage, etc.)
|
|
5
|
+
-keep class com.novastera.llamarn.** {
|
|
6
|
+
*;
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
# Keep native methods (JNI)
|
|
10
|
+
-keepclassmembers class com.novastera.llamarn.** {
|
|
11
|
+
native <methods>;
|
|
12
|
+
}
|
|
@@ -71,52 +71,13 @@ extern "C" {
|
|
|
71
71
|
typedef int32_t llama_seq_id;
|
|
72
72
|
|
|
73
73
|
enum llama_vocab_type {
|
|
74
|
-
LLAMA_VOCAB_TYPE_NONE
|
|
75
|
-
LLAMA_VOCAB_TYPE_SPM
|
|
76
|
-
LLAMA_VOCAB_TYPE_BPE
|
|
77
|
-
LLAMA_VOCAB_TYPE_WPM
|
|
78
|
-
LLAMA_VOCAB_TYPE_UGM
|
|
79
|
-
LLAMA_VOCAB_TYPE_RWKV
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
// pre-tokenization types
|
|
83
|
-
enum llama_vocab_pre_type {
|
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
|
90
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
|
91
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
|
92
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
|
93
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
|
94
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
|
95
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
|
96
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
|
97
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
|
98
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
|
99
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
|
100
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
|
101
|
-
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
|
102
|
-
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
|
103
|
-
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
|
104
|
-
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
|
105
|
-
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
|
106
|
-
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
|
107
|
-
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
|
108
|
-
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
|
109
|
-
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
|
110
|
-
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
|
111
|
-
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
|
112
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
|
113
|
-
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
|
114
|
-
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
|
115
|
-
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
|
116
|
-
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
|
117
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
|
118
|
-
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
|
119
|
-
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
|
74
|
+
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
|
75
|
+
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
|
76
|
+
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
|
77
|
+
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
|
78
|
+
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
|
|
79
|
+
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
|
|
80
|
+
LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
|
|
120
81
|
};
|
|
121
82
|
|
|
122
83
|
enum llama_rope_type {
|
|
@@ -374,6 +335,9 @@ extern "C" {
|
|
|
374
335
|
bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
375
336
|
// NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
|
|
376
337
|
// ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
|
|
338
|
+
bool kv_unified; // use a unified buffer across the input sequences when computing the attention
|
|
339
|
+
// try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
|
|
340
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/14363
|
|
377
341
|
};
|
|
378
342
|
|
|
379
343
|
// model quantization parameters
|
|
@@ -764,7 +728,7 @@ extern "C" {
|
|
|
764
728
|
// - lazily on next llama_decode()
|
|
765
729
|
// p0 < 0 : [0, p1]
|
|
766
730
|
// p1 < 0 : [p0, inf)
|
|
767
|
-
DEPRECATED(void llama_kv_self_seq_div(
|
|
731
|
+
DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
|
|
768
732
|
struct llama_context * ctx,
|
|
769
733
|
llama_seq_id seq_id,
|
|
770
734
|
llama_pos p0,
|
|
@@ -992,6 +956,7 @@ extern "C" {
|
|
|
992
956
|
// in the order they have appeared in the batch.
|
|
993
957
|
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
|
994
958
|
// Cols: n_vocab
|
|
959
|
+
// TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
|
995
960
|
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
|
996
961
|
|
|
997
962
|
// Logits for the ith token. For positive indices, Equivalent to:
|
|
@@ -1006,6 +971,7 @@ extern "C" {
|
|
|
1006
971
|
// in the order they have appeared in the batch.
|
|
1007
972
|
// shape: [n_outputs*n_embd]
|
|
1008
973
|
// Otherwise, returns NULL.
|
|
974
|
+
// TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
|
|
1009
975
|
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
|
1010
976
|
|
|
1011
977
|
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
|
@@ -1044,6 +1010,7 @@ extern "C" {
|
|
|
1044
1010
|
LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
|
|
1045
1011
|
LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
|
|
1046
1012
|
LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
|
|
1013
|
+
LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
|
|
1047
1014
|
|
|
1048
1015
|
LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
|
|
1049
1016
|
LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
|
|
@@ -1429,6 +1396,7 @@ extern "C" {
|
|
|
1429
1396
|
|
|
1430
1397
|
int32_t n_p_eval;
|
|
1431
1398
|
int32_t n_eval;
|
|
1399
|
+
int32_t n_reused; // number of times a ggml compute graph had been reused
|
|
1432
1400
|
};
|
|
1433
1401
|
|
|
1434
1402
|
struct llama_perf_sampler_data {
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/cpp/build-info.cpp
CHANGED
|
@@ -55,6 +55,17 @@
|
|
|
55
55
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
|
56
56
|
}
|
|
57
57
|
},
|
|
58
|
+
{
|
|
59
|
+
"name": "x64-linux-gcc", "hidden": true,
|
|
60
|
+
"cacheVariables": {
|
|
61
|
+
"CMAKE_C_COMPILER": "gcc",
|
|
62
|
+
"CMAKE_CXX_COMPILER": "g++"
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
{ "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
|
|
66
|
+
{ "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
|
|
67
|
+
{ "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
|
|
68
|
+
{ "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
|
|
58
69
|
|
|
59
70
|
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
|
60
71
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
package/cpp/llama.cpp/CODEOWNERS
CHANGED
package/cpp/llama.cpp/README.md
CHANGED
|
@@ -133,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
133
133
|
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
|
|
134
134
|
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
|
|
135
135
|
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
|
|
136
|
+
- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
|
|
136
137
|
|
|
137
138
|
#### Multimodal
|
|
138
139
|
|
|
@@ -268,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
|
|
|
268
269
|
| [Vulkan](docs/build.md#vulkan) | GPU |
|
|
269
270
|
| [CANN](docs/build.md#cann) | Ascend NPU |
|
|
270
271
|
| [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
|
|
272
|
+
| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
|
|
271
273
|
| [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
|
|
272
274
|
|
|
273
275
|
## Obtaining and quantizing models
|
|
@@ -433,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
|
|
433
435
|
|
|
434
436
|
## [`llama-perplexity`](tools/perplexity)
|
|
435
437
|
|
|
436
|
-
#### A tool for measuring the perplexity [^1]
|
|
438
|
+
#### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
|
|
437
439
|
|
|
438
440
|
- <details open>
|
|
439
441
|
<summary>Measure the perplexity over a text file</summary>
|
|
@@ -456,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
|
|
|
456
458
|
|
|
457
459
|
</details>
|
|
458
460
|
|
|
459
|
-
[^1]: [
|
|
460
|
-
[^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
|
461
|
+
[^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
|
|
461
462
|
|
|
462
463
|
## [`llama-bench`](tools/llama-bench)
|
|
463
464
|
|
|
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1464
1464
|
params.swa_full = true;
|
|
1465
1465
|
}
|
|
1466
1466
|
).set_env("LLAMA_ARG_SWA_FULL"));
|
|
1467
|
+
add_opt(common_arg(
|
|
1468
|
+
{"--kv-unified", "-kvu"},
|
|
1469
|
+
string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
|
|
1470
|
+
"[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
|
|
1471
|
+
[](common_params & params) {
|
|
1472
|
+
params.kv_unified = true;
|
|
1473
|
+
}
|
|
1474
|
+
).set_env("LLAMA_ARG_KV_SPLIT"));
|
|
1467
1475
|
add_opt(common_arg(
|
|
1468
1476
|
{"--no-context-shift"},
|
|
1469
1477
|
string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
@@ -1604,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
1604
1612
|
[](common_params & params, const std::string & value) {
|
|
1605
1613
|
params.antiprompt.emplace_back(value);
|
|
1606
1614
|
}
|
|
1607
|
-
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
|
1615
|
+
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
|
1608
1616
|
add_opt(common_arg(
|
|
1609
1617
|
{"-sp", "--special"},
|
|
1610
1618
|
string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
|
|
@@ -2647,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
2647
2655
|
params.i_chunk = value;
|
|
2648
2656
|
}
|
|
2649
2657
|
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2658
|
+
add_opt(common_arg(
|
|
2659
|
+
{"--show-statistics"},
|
|
2660
|
+
string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
|
|
2661
|
+
[](common_params & params) {
|
|
2662
|
+
params.show_statistics = true;
|
|
2663
|
+
}
|
|
2664
|
+
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
|
|
2650
2665
|
add_opt(common_arg(
|
|
2651
2666
|
{"--parse-special"},
|
|
2652
2667
|
string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
|
|
@@ -3423,5 +3438,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|
|
3423
3438
|
}
|
|
3424
3439
|
).set_examples({LLAMA_EXAMPLE_SERVER}));
|
|
3425
3440
|
|
|
3441
|
+
// diffusion parameters
|
|
3442
|
+
add_opt(common_arg(
|
|
3443
|
+
{ "--diffusion-steps" }, "N",
|
|
3444
|
+
string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
|
|
3445
|
+
[](common_params & params, int value) { params.diffusion.steps = value; }
|
|
3446
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3447
|
+
add_opt(common_arg(
|
|
3448
|
+
{ "--diffusion-eps" }, "F",
|
|
3449
|
+
string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
|
|
3450
|
+
[](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
|
|
3451
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3452
|
+
add_opt(common_arg(
|
|
3453
|
+
{ "--diffusion-algorithm" }, "N",
|
|
3454
|
+
string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
|
|
3455
|
+
params.diffusion.algorithm),
|
|
3456
|
+
[](common_params & params, int value) { params.diffusion.algorithm = value; }
|
|
3457
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3458
|
+
add_opt(common_arg(
|
|
3459
|
+
{ "--diffusion-alg-temp" }, "F",
|
|
3460
|
+
string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
|
|
3461
|
+
[](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
|
|
3462
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3463
|
+
add_opt(common_arg(
|
|
3464
|
+
{ "--diffusion-visual" },
|
|
3465
|
+
string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
|
|
3466
|
+
params.diffusion.visual_mode ? "true" : "false"),
|
|
3467
|
+
[](common_params & params) { params.diffusion.visual_mode = true; }
|
|
3468
|
+
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
|
|
3469
|
+
|
|
3426
3470
|
return ctx_arg;
|
|
3427
3471
|
}
|
|
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
|
|
|
448
448
|
bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
|
|
449
449
|
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
|
|
450
450
|
}
|
|
451
|
+
|
|
452
|
+
bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
|
|
453
|
+
bool has_suffix = string_ends_with(str, suffix);
|
|
454
|
+
if (has_suffix) {
|
|
455
|
+
str = str.substr(0, str.size() - suffix.size());
|
|
456
|
+
}
|
|
457
|
+
return has_suffix;
|
|
458
|
+
}
|
|
459
|
+
|
|
451
460
|
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
|
|
452
461
|
if (!str.empty() && !stop.empty()) {
|
|
453
462
|
const char text_last_char = str.back();
|
|
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
|
|
|
1005
1014
|
params.sampling.ignore_eos = false;
|
|
1006
1015
|
}
|
|
1007
1016
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
}
|
|
1017
|
+
// initialize once
|
|
1018
|
+
for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
|
|
1019
|
+
if (llama_vocab_is_eog(vocab, i)) {
|
|
1020
|
+
LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
|
|
1021
|
+
params.sampling.logit_bias_eog.push_back({i, -INFINITY});
|
|
1014
1022
|
}
|
|
1015
1023
|
}
|
|
1016
1024
|
|
|
1025
|
+
if (params.sampling.ignore_eos) {
|
|
1026
|
+
// add EOG biases to the active set of logit biases
|
|
1027
|
+
params.sampling.logit_bias.insert(
|
|
1028
|
+
params.sampling.logit_bias.end(),
|
|
1029
|
+
params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1017
1032
|
if (params.sampling.penalty_last_n == -1) {
|
|
1018
1033
|
LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
|
|
1019
1034
|
params.sampling.penalty_last_n = llama_n_ctx(lctx);
|
|
@@ -1157,6 +1172,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
|
|
1157
1172
|
cparams.no_perf = params.no_perf;
|
|
1158
1173
|
cparams.op_offload = !params.no_op_offload;
|
|
1159
1174
|
cparams.swa_full = params.swa_full;
|
|
1175
|
+
cparams.kv_unified = params.kv_unified;
|
|
1160
1176
|
|
|
1161
1177
|
cparams.type_k = params.cache_type_k;
|
|
1162
1178
|
cparams.type_v = params.cache_type_v;
|
|
@@ -81,6 +81,7 @@ enum llama_example {
|
|
|
81
81
|
LLAMA_EXAMPLE_LOOKUP,
|
|
82
82
|
LLAMA_EXAMPLE_PARALLEL,
|
|
83
83
|
LLAMA_EXAMPLE_TTS,
|
|
84
|
+
LLAMA_EXAMPLE_DIFFUSION,
|
|
84
85
|
|
|
85
86
|
LLAMA_EXAMPLE_COUNT,
|
|
86
87
|
};
|
|
@@ -177,7 +178,8 @@ struct common_params_sampling {
|
|
|
177
178
|
std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
|
|
178
179
|
std::set<llama_token> preserved_tokens;
|
|
179
180
|
|
|
180
|
-
std::vector<llama_logit_bias> logit_bias;
|
|
181
|
+
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
|
182
|
+
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
|
181
183
|
|
|
182
184
|
// print the parameters into a string
|
|
183
185
|
std::string print() const;
|
|
@@ -217,6 +219,14 @@ struct common_params_vocoder {
|
|
|
217
219
|
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
|
218
220
|
};
|
|
219
221
|
|
|
222
|
+
struct common_params_diffusion {
|
|
223
|
+
int32_t steps = 64; // number of diffusion steps
|
|
224
|
+
float eps = 1e-3f; // epsilon for timesteps
|
|
225
|
+
int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
|
|
226
|
+
float alg_temp = 0.0f; // algorithm temperature
|
|
227
|
+
bool visual_mode = false; // show progressive diffusion on screen
|
|
228
|
+
};
|
|
229
|
+
|
|
220
230
|
enum common_reasoning_format {
|
|
221
231
|
COMMON_REASONING_FORMAT_NONE,
|
|
222
232
|
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
|
|
@@ -268,6 +278,7 @@ struct common_params {
|
|
|
268
278
|
struct common_params_sampling sampling;
|
|
269
279
|
struct common_params_speculative speculative;
|
|
270
280
|
struct common_params_vocoder vocoder;
|
|
281
|
+
struct common_params_diffusion diffusion;
|
|
271
282
|
|
|
272
283
|
struct common_params_model model;
|
|
273
284
|
|
|
@@ -330,6 +341,7 @@ struct common_params {
|
|
|
330
341
|
bool no_perf = false; // disable performance metrics
|
|
331
342
|
bool ctx_shift = true; // context shift on inifinite text generation
|
|
332
343
|
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
344
|
+
bool kv_unified = false; // enable unified KV cache
|
|
333
345
|
|
|
334
346
|
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
|
|
335
347
|
bool use_mmap = true; // use mmap for faster loads
|
|
@@ -420,9 +432,10 @@ struct common_params {
|
|
|
420
432
|
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
|
|
421
433
|
int32_t i_chunk = 0; // start processing from this chunk
|
|
422
434
|
|
|
423
|
-
bool process_output
|
|
424
|
-
bool compute_ppl
|
|
425
|
-
bool
|
|
435
|
+
bool process_output = false; // collect data for the output tensor
|
|
436
|
+
bool compute_ppl = true; // whether to compute perplexity
|
|
437
|
+
bool show_statistics = false; // show imatrix statistics per tensor
|
|
438
|
+
bool parse_special = false; // whether to parse special tokens during imatrix tokenization
|
|
426
439
|
|
|
427
440
|
// cvector-generator params
|
|
428
441
|
int n_pca_batch = 100;
|
|
@@ -522,6 +535,7 @@ static bool string_starts_with(const std::string & str,
|
|
|
522
535
|
|
|
523
536
|
// While we wait for C++20's std::string::ends_with...
|
|
524
537
|
bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
|
|
538
|
+
bool string_remove_suffix(std::string & str, const std::string_view & suffix);
|
|
525
539
|
size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
|
|
526
540
|
|
|
527
541
|
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
|