@novastera-oss/llamarn 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakePresets.json +11 -0
  22. package/cpp/llama.cpp/CODEOWNERS +1 -0
  23. package/cpp/llama.cpp/README.md +4 -3
  24. package/cpp/llama.cpp/common/arg.cpp +45 -1
  25. package/cpp/llama.cpp/common/common.cpp +22 -6
  26. package/cpp/llama.cpp/common/common.h +18 -4
  27. package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
  28. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
  30. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  31. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  32. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  34. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
  35. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
  78. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
  109. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  115. package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
  116. package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
  117. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  118. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
  120. package/cpp/llama.cpp/include/llama.h +15 -7
  121. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  122. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  123. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  124. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  125. package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
  126. package/cpp/llama.cpp/src/llama-arch.h +5 -0
  127. package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
  128. package/cpp/llama.cpp/src/llama-batch.h +24 -18
  129. package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
  130. package/cpp/llama.cpp/src/llama-chat.h +2 -0
  131. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  132. package/cpp/llama.cpp/src/llama-context.h +26 -16
  133. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  134. package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
  135. package/cpp/llama.cpp/src/llama-graph.h +147 -72
  136. package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
  137. package/cpp/llama.cpp/src/llama-hparams.h +10 -2
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  139. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  140. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  141. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  142. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
  144. package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
  145. package/cpp/llama.cpp/src/llama-model.h +3 -4
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
  148. package/cpp/llama.cpp/src/llama-vocab.h +2 -0
  149. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  150. package/cpp/llama.cpp/src/unicode.h +2 -0
  151. package/ios/include/common.h +18 -4
  152. package/ios/include/llama.h +15 -7
  153. package/ios/libs/llama.xcframework/Info.plist +15 -15
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  155. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  158. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  165. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  172. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  174. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
  175. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  176. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  177. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  178. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  179. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  180. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
  183. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
  184. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  186. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
  187. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
  188. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  189. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  190. package/package.json +4 -4
@@ -67,7 +67,8 @@ android {
67
67
  minSdkVersion getExtOrIntegerDefault("minSdkVersion")
68
68
  targetSdkVersion getExtOrIntegerDefault("targetSdkVersion")
69
69
 
70
-
70
+ // Include ProGuard rules for apps that use this library
71
+ consumerProguardFiles 'proguard-rules.pro'
71
72
  }
72
73
 
73
74
 
@@ -0,0 +1,12 @@
1
+ # ProGuard rules for @novastera-oss/llamarn library
2
+ # These rules will be automatically included when apps use this library
3
+
4
+ # Keep all classes in our package (includes NativeRNLlamaCppSpec, RNLlamaCppPackage, etc.)
5
+ -keep class com.novastera.llamarn.** {
6
+ *;
7
+ }
8
+
9
+ # Keep native methods (JNI)
10
+ -keepclassmembers class com.novastera.llamarn.** {
11
+ native <methods>;
12
+ }
@@ -71,52 +71,13 @@ extern "C" {
71
71
  typedef int32_t llama_seq_id;
72
72
 
73
73
  enum llama_vocab_type {
74
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
- };
81
-
82
- // pre-tokenization types
83
- enum llama_vocab_pre_type {
84
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
85
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
86
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
87
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
88
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
89
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
90
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
91
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
92
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
93
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
94
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
95
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
96
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
97
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
98
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
99
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
100
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
101
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
102
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
103
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
104
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
105
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
106
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
107
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
108
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
109
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
110
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
111
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
112
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
113
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
114
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
115
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
116
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
117
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
118
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
119
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
74
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
120
81
  };
121
82
 
122
83
  enum llama_rope_type {
@@ -374,6 +335,9 @@ extern "C" {
374
335
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
375
336
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
376
337
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
338
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
339
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
340
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
377
341
  };
378
342
 
379
343
  // model quantization parameters
@@ -764,7 +728,7 @@ extern "C" {
764
728
  // - lazily on next llama_decode()
765
729
  // p0 < 0 : [0, p1]
766
730
  // p1 < 0 : [p0, inf)
767
- DEPRECATED(void llama_kv_self_seq_div(
731
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
768
732
  struct llama_context * ctx,
769
733
  llama_seq_id seq_id,
770
734
  llama_pos p0,
@@ -992,6 +956,7 @@ extern "C" {
992
956
  // in the order they have appeared in the batch.
993
957
  // Rows: number of tokens for which llama_batch.logits[i] != 0
994
958
  // Cols: n_vocab
959
+ // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
995
960
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
996
961
 
997
962
  // Logits for the ith token. For positive indices, Equivalent to:
@@ -1006,6 +971,7 @@ extern "C" {
1006
971
  // in the order they have appeared in the batch.
1007
972
  // shape: [n_outputs*n_embd]
1008
973
  // Otherwise, returns NULL.
974
+ // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
1009
975
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
1010
976
 
1011
977
  // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1044,6 +1010,7 @@ extern "C" {
1044
1010
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1045
1011
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1046
1012
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1013
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1047
1014
 
1048
1015
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1049
1016
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -1429,6 +1396,7 @@ extern "C" {
1429
1396
 
1430
1397
  int32_t n_p_eval;
1431
1398
  int32_t n_eval;
1399
+ int32_t n_reused; // number of times a ggml compute graph had been reused
1432
1400
  };
1433
1401
 
1434
1402
  struct llama_perf_sampler_data {
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = 5880;
2
- char const *LLAMA_COMMIT = "3120413c";
1
+ int LLAMA_BUILD_NUMBER = 6000;
2
+ char const *LLAMA_COMMIT = "4762ad73";
3
3
  char const *LLAMA_COMPILER = "unknown";
4
4
  char const *LLAMA_BUILD_TARGET = "unknown";
@@ -55,6 +55,17 @@
55
55
  "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
56
56
  }
57
57
  },
58
+ {
59
+ "name": "x64-linux-gcc", "hidden": true,
60
+ "cacheVariables": {
61
+ "CMAKE_C_COMPILER": "gcc",
62
+ "CMAKE_CXX_COMPILER": "g++"
63
+ }
64
+ },
65
+ { "name": "x64-linux-gcc-debug", "inherits": [ "base", "x64-linux-gcc", "debug" ] },
66
+ { "name": "x64-linux-gcc-release", "inherits": [ "base", "x64-linux-gcc", "release" ] },
67
+ { "name": "x64-linux-gcc-reldbg", "inherits": [ "base", "x64-linux-gcc", "reldbg" ] },
68
+ { "name": "x64-linux-gcc+static-release", "inherits": [ "base", "x64-linux-gcc", "release", "static" ] },
58
69
 
59
70
  { "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
60
71
  { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
@@ -9,3 +9,4 @@
9
9
  /ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
10
10
  /ggml/src/ggml-opt.cpp @JohannesGaessler
11
11
  /ggml/src/gguf.cpp @JohannesGaessler
12
+ /ggml/src/ggml-vulkan/ @0cc4m
@@ -133,6 +133,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
133
133
  - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
134
134
  - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
135
135
  - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)
136
+ - [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38)
136
137
 
137
138
  #### Multimodal
138
139
 
@@ -268,6 +269,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
268
269
  | [Vulkan](docs/build.md#vulkan) | GPU |
269
270
  | [CANN](docs/build.md#cann) | Ascend NPU |
270
271
  | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
272
+ | [WebGPU [In Progress]](docs/build.md#webgpu) | All |
271
273
  | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
272
274
 
273
275
  ## Obtaining and quantizing models
@@ -433,7 +435,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
433
435
 
434
436
  ## [`llama-perplexity`](tools/perplexity)
435
437
 
436
- #### A tool for measuring the perplexity [^1][^2] (and other quality metrics) of a model over a given text.
438
+ #### A tool for measuring the [perplexity](tools/perplexity/README.md) [^1] (and other quality metrics) of a model over a given text.
437
439
 
438
440
  - <details open>
439
441
  <summary>Measure the perplexity over a text file</summary>
@@ -456,8 +458,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
456
458
 
457
459
  </details>
458
460
 
459
- [^1]: [tools/perplexity/README.md](./tools/perplexity/README.md)
460
- [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
461
+ [^1]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
461
462
 
462
463
  ## [`llama-bench`](tools/llama-bench)
463
464
 
@@ -1464,6 +1464,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1464
1464
  params.swa_full = true;
1465
1465
  }
1466
1466
  ).set_env("LLAMA_ARG_SWA_FULL"));
1467
+ add_opt(common_arg(
1468
+ {"--kv-unified", "-kvu"},
1469
+ string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
1470
+ "[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)", params.kv_unified ? "true" : "false"),
1471
+ [](common_params & params) {
1472
+ params.kv_unified = true;
1473
+ }
1474
+ ).set_env("LLAMA_ARG_KV_SPLIT"));
1467
1475
  add_opt(common_arg(
1468
1476
  {"--no-context-shift"},
1469
1477
  string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
@@ -1604,7 +1612,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
1604
1612
  [](common_params & params, const std::string & value) {
1605
1613
  params.antiprompt.emplace_back(value);
1606
1614
  }
1607
- ).set_examples({LLAMA_EXAMPLE_MAIN}));
1615
+ ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
1608
1616
  add_opt(common_arg(
1609
1617
  {"-sp", "--special"},
1610
1618
  string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
@@ -2647,6 +2655,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2647
2655
  params.i_chunk = value;
2648
2656
  }
2649
2657
  ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2658
+ add_opt(common_arg(
2659
+ {"--show-statistics"},
2660
+ string_format("show imatrix statistics and then exit (default: %s)", params.show_statistics ? "true" : "false"),
2661
+ [](common_params & params) {
2662
+ params.show_statistics = true;
2663
+ }
2664
+ ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
2650
2665
  add_opt(common_arg(
2651
2666
  {"--parse-special"},
2652
2667
  string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
@@ -3423,5 +3438,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
3423
3438
  }
3424
3439
  ).set_examples({LLAMA_EXAMPLE_SERVER}));
3425
3440
 
3441
+ // diffusion parameters
3442
+ add_opt(common_arg(
3443
+ { "--diffusion-steps" }, "N",
3444
+ string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
3445
+ [](common_params & params, int value) { params.diffusion.steps = value; }
3446
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3447
+ add_opt(common_arg(
3448
+ { "--diffusion-eps" }, "F",
3449
+ string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
3450
+ [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
3451
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3452
+ add_opt(common_arg(
3453
+ { "--diffusion-algorithm" }, "N",
3454
+ string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
3455
+ params.diffusion.algorithm),
3456
+ [](common_params & params, int value) { params.diffusion.algorithm = value; }
3457
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3458
+ add_opt(common_arg(
3459
+ { "--diffusion-alg-temp" }, "F",
3460
+ string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
3461
+ [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); }
3462
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3463
+ add_opt(common_arg(
3464
+ { "--diffusion-visual" },
3465
+ string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
3466
+ params.diffusion.visual_mode ? "true" : "false"),
3467
+ [](common_params & params) { params.diffusion.visual_mode = true; }
3468
+ ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
3469
+
3426
3470
  return ctx_arg;
3427
3471
  }
@@ -448,6 +448,15 @@ void string_replace_all(std::string & s, const std::string & search, const std::
448
448
  bool string_ends_with(const std::string_view & str, const std::string_view & suffix) {
449
449
  return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
450
450
  }
451
+
452
+ bool string_remove_suffix(std::string & str, const std::string_view & suffix) {
453
+ bool has_suffix = string_ends_with(str, suffix);
454
+ if (has_suffix) {
455
+ str = str.substr(0, str.size() - suffix.size());
456
+ }
457
+ return has_suffix;
458
+ }
459
+
451
460
  size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop) {
452
461
  if (!str.empty() && !stop.empty()) {
453
462
  const char text_last_char = str.back();
@@ -1005,15 +1014,21 @@ struct common_init_result common_init_from_params(common_params & params) {
1005
1014
  params.sampling.ignore_eos = false;
1006
1015
  }
1007
1016
 
1008
- if (params.sampling.ignore_eos) {
1009
- for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1010
- if (llama_vocab_is_eog(vocab, i)) {
1011
- LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1012
- params.sampling.logit_bias.push_back({i, -INFINITY});
1013
- }
1017
+ // initialize once
1018
+ for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
1019
+ if (llama_vocab_is_eog(vocab, i)) {
1020
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
1021
+ params.sampling.logit_bias_eog.push_back({i, -INFINITY});
1014
1022
  }
1015
1023
  }
1016
1024
 
1025
+ if (params.sampling.ignore_eos) {
1026
+ // add EOG biases to the active set of logit biases
1027
+ params.sampling.logit_bias.insert(
1028
+ params.sampling.logit_bias.end(),
1029
+ params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
1030
+ }
1031
+
1017
1032
  if (params.sampling.penalty_last_n == -1) {
1018
1033
  LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
1019
1034
  params.sampling.penalty_last_n = llama_n_ctx(lctx);
@@ -1157,6 +1172,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
1157
1172
  cparams.no_perf = params.no_perf;
1158
1173
  cparams.op_offload = !params.no_op_offload;
1159
1174
  cparams.swa_full = params.swa_full;
1175
+ cparams.kv_unified = params.kv_unified;
1160
1176
 
1161
1177
  cparams.type_k = params.cache_type_k;
1162
1178
  cparams.type_v = params.cache_type_v;
@@ -81,6 +81,7 @@ enum llama_example {
81
81
  LLAMA_EXAMPLE_LOOKUP,
82
82
  LLAMA_EXAMPLE_PARALLEL,
83
83
  LLAMA_EXAMPLE_TTS,
84
+ LLAMA_EXAMPLE_DIFFUSION,
84
85
 
85
86
  LLAMA_EXAMPLE_COUNT,
86
87
  };
@@ -177,7 +178,8 @@ struct common_params_sampling {
177
178
  std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
178
179
  std::set<llama_token> preserved_tokens;
179
180
 
180
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
181
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
182
+ std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
181
183
 
182
184
  // print the parameters into a string
183
185
  std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
217
219
  bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
218
220
  };
219
221
 
222
+ struct common_params_diffusion {
223
+ int32_t steps = 64; // number of diffusion steps
224
+ float eps = 1e-3f; // epsilon for timesteps
225
+ int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
226
+ float alg_temp = 0.0f; // algorithm temperature
227
+ bool visual_mode = false; // show progressive diffusion on screen
228
+ };
229
+
220
230
  enum common_reasoning_format {
221
231
  COMMON_REASONING_FORMAT_NONE,
222
232
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -268,6 +278,7 @@ struct common_params {
268
278
  struct common_params_sampling sampling;
269
279
  struct common_params_speculative speculative;
270
280
  struct common_params_vocoder vocoder;
281
+ struct common_params_diffusion diffusion;
271
282
 
272
283
  struct common_params_model model;
273
284
 
@@ -330,6 +341,7 @@ struct common_params {
330
341
  bool no_perf = false; // disable performance metrics
331
342
  bool ctx_shift = true; // context shift on inifinite text generation
332
343
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
344
+ bool kv_unified = false; // enable unified KV cache
333
345
 
334
346
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
335
347
  bool use_mmap = true; // use mmap for faster loads
@@ -420,9 +432,10 @@ struct common_params {
420
432
  int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
421
433
  int32_t i_chunk = 0; // start processing from this chunk
422
434
 
423
- bool process_output = false; // collect data for the output tensor
424
- bool compute_ppl = true; // whether to compute perplexity
425
- bool parse_special = false; // whether to parse special tokens during imatrix tokenization
435
+ bool process_output = false; // collect data for the output tensor
436
+ bool compute_ppl = true; // whether to compute perplexity
437
+ bool show_statistics = false; // show imatrix statistics per tensor
438
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization
426
439
 
427
440
  // cvector-generator params
428
441
  int n_pca_batch = 100;
@@ -522,6 +535,7 @@ static bool string_starts_with(const std::string & str,
522
535
 
523
536
  // While we wait for C++20's std::string::ends_with...
524
537
  bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
538
+ bool string_remove_suffix(std::string & str, const std::string_view & suffix);
525
539
  size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
526
540
 
527
541
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);