@novastera-oss/llamarn 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakePresets.json +11 -0
  22. package/cpp/llama.cpp/CODEOWNERS +1 -0
  23. package/cpp/llama.cpp/README.md +4 -3
  24. package/cpp/llama.cpp/common/arg.cpp +45 -1
  25. package/cpp/llama.cpp/common/common.cpp +22 -6
  26. package/cpp/llama.cpp/common/common.h +18 -4
  27. package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
  28. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
  30. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  31. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  32. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  34. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
  35. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
  78. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
  109. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  115. package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
  116. package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
  117. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  118. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
  120. package/cpp/llama.cpp/include/llama.h +15 -7
  121. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  122. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  123. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  124. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  125. package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
  126. package/cpp/llama.cpp/src/llama-arch.h +5 -0
  127. package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
  128. package/cpp/llama.cpp/src/llama-batch.h +24 -18
  129. package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
  130. package/cpp/llama.cpp/src/llama-chat.h +2 -0
  131. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  132. package/cpp/llama.cpp/src/llama-context.h +26 -16
  133. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  134. package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
  135. package/cpp/llama.cpp/src/llama-graph.h +147 -72
  136. package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
  137. package/cpp/llama.cpp/src/llama-hparams.h +10 -2
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  139. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  140. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  141. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  142. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
  144. package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
  145. package/cpp/llama.cpp/src/llama-model.h +3 -4
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
  148. package/cpp/llama.cpp/src/llama-vocab.h +2 -0
  149. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  150. package/cpp/llama.cpp/src/unicode.h +2 -0
  151. package/ios/include/common.h +18 -4
  152. package/ios/include/llama.h +15 -7
  153. package/ios/libs/llama.xcframework/Info.plist +15 -15
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  155. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  158. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  165. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  172. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  174. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
  175. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  176. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  177. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  178. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  179. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  180. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
  183. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
  184. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  186. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
  187. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
  188. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  189. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  190. package/package.json +4 -4
@@ -557,6 +557,178 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
557
557
  return bpe_offsets;
558
558
  }
559
559
 
560
+ // K2 system regex patterns (from tokenization_kimi.py):
561
+ // [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
562
+ static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) { // hand-written matcher for the pattern above: each entry of `offsets` is a chunk length counted in code points of `text`; returns the lengths of the pieces each chunk is split into, in order
563
+ std::vector<size_t> bpe_offsets;
564
+ bpe_offsets.reserve(offsets.size());
565
+
566
+ const auto cpts = unicode_cpts_from_utf8(text);
567
+
568
+ size_t start = 0;
569
+ for (auto offset : offsets) {
570
+ const size_t offset_ini = start;
571
+ const size_t offset_end = start + offset;
572
+ assert(offset_end <= cpts.size());
573
+ start = offset_end;
574
+
575
+ static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; // sentinel returned by _get_cpt for positions outside the current chunk
576
+ auto _get_cpt = [&] (const size_t pos) -> uint32_t { // code point at pos, or OUT_OF_RANGE when pos is outside [offset_ini, offset_end)
577
+ return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
578
+ };
579
+
580
+ auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags { // flags of the code point at pos, or default-constructed flags when pos is outside the current chunk
581
+ return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
582
+ };
583
+
584
+ size_t _prev_end = offset_ini;
585
+ auto _add_token = [&] (const size_t end) -> size_t { // records the length of [_prev_end, end) when it is non-empty, moves _prev_end to end, and returns that length
586
+ assert(_prev_end <= end && end <= offset_end);
587
+ size_t len = end - _prev_end;
588
+ if (len > 0) {
589
+ bpe_offsets.push_back(len);
590
+ }
591
+ _prev_end = end;
592
+ return len;
593
+ };
594
+
595
+ for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { // pos is advanced inside the body by whichever pattern matches
596
+ const uint32_t cpt = _get_cpt(pos);
597
+ const auto flags = _get_flags(pos);
598
+
599
+ // Pattern 1: [\p{Han}]+ (Chinese characters)
600
+ if (unicode_cpt_is_han(cpt)) {
601
+ while (unicode_cpt_is_han(_get_cpt(pos))) {
602
+ pos++;
603
+ }
604
+ _add_token(pos);
605
+ continue;
606
+ }
607
+
608
+ // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
609
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
610
+ // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
611
+ // Check if current char is a letter OR if current char could be a leading char and next char is a letter
612
+ bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
613
+ (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
614
+ _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));
615
+
616
+ if (is_letter_pattern) {
617
+ // Handle optional leading non-letter/non-number character
618
+ bool has_leading_char = false;
619
+ if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
620
+ has_leading_char = true;
621
+ pos++;
622
+ }
623
+
624
+ // Match letter sequence (excluding Han characters)
625
+ bool has_letters = false;
626
+ while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
627
+ has_letters = true;
628
+ pos++;
629
+ }
630
+
631
+ // Only proceed if we found letters (after potentially skipping leading char)
632
+ if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
633
+ if (!has_letters) pos++; // consume the first letter if we didn't already
634
+
635
+ // Continue consuming letters
636
+ while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
637
+ pos++;
638
+ }
639
+
640
+ // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d); the following code points are lowercased first, so the match is case-insensitive
641
+ if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
642
+ uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
643
+ if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
644
+ pos += 2;
645
+ } else if (pos + 2 < offset_end) {
646
+ uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
647
+ if ((cpt_next == 'r' && cpt_next_next == 'e') ||
648
+ (cpt_next == 'v' && cpt_next_next == 'e') ||
649
+ (cpt_next == 'l' && cpt_next_next == 'l')) {
650
+ pos += 3;
651
+ }
652
+ }
653
+ }
654
+
655
+ _add_token(pos);
656
+ continue;
657
+ } else if (has_leading_char) {
658
+ // We consumed a leading char but found no letters, backtrack
659
+ pos--;
660
+ }
661
+ }
662
+
663
+ // Pattern 4: \p{N}{1,3} (numbers 1-3 digits); a longer run of digits is emitted as consecutive groups of 3 plus a shorter remainder
664
+ if (flags.is_number) {
665
+ size_t ini = pos;
666
+ while (_get_flags(pos).is_number) {
667
+ if (++pos - ini >= 3) {
668
+ _add_token(pos);
669
+ ini = pos;
670
+ }
671
+ }
672
+ _add_token(pos);
673
+ continue;
674
+ }
675
+
676
+ // Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
677
+ auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
678
+ if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
679
+ pos += (cpt == ' ');
680
+ while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
681
+ flags2 = _get_flags(++pos);
682
+ }
683
+ // Match optional [\r\n]*
684
+ uint32_t cpt2 = _get_cpt(pos);
685
+ while (cpt2 == '\r' || cpt2 == '\n') {
686
+ cpt2 = _get_cpt(++pos);
687
+ }
688
+ _add_token(pos);
689
+ continue;
690
+ }
691
+
692
+ // Count whitespace characters starting at pos, remembering the position just past the last \r or \n seen in that run
693
+ size_t num_whitespaces = 0;
694
+ size_t last_end_r_or_n = 0;
695
+ while (_get_flags(pos + num_whitespaces).is_whitespace) {
696
+ uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
697
+ if (cpt2 == '\r' || cpt2 == '\n') {
698
+ last_end_r_or_n = pos + num_whitespaces + 1;
699
+ }
700
+ num_whitespaces++;
701
+ }
702
+
703
+ // Pattern 6: \s*[\r\n]+ (whitespace with newlines); the token ends right after the last newline of the run
704
+ if (last_end_r_or_n > 0) {
705
+ pos = last_end_r_or_n;
706
+ _add_token(pos);
707
+ continue;
708
+ }
709
+
710
+ // Pattern 7: \s+(?!\S) - a run of 2+ whitespace chars that is followed by another char inside the chunk: emit all but the last whitespace
711
+ if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
712
+ pos += num_whitespaces - 1;
713
+ _add_token(pos);
714
+ continue;
715
+ }
716
+
717
+ // Pattern 8: \s+ (general whitespace)
718
+ if (num_whitespaces > 0) {
719
+ pos += num_whitespaces;
720
+ _add_token(pos);
721
+ continue;
722
+ }
723
+
724
+ // No matches - consume single character
725
+ _add_token(++pos);
726
+ }
727
+ }
728
+
729
+ return bpe_offsets;
730
+ }
731
+
560
732
  static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
561
733
  std::vector<size_t> bpe_offsets;
562
734
 
@@ -567,6 +739,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
567
739
  regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
568
740
 
569
741
  bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
742
+ } else if (regex_expr == "\\p{Han}+") {
743
+ // K2's first pattern - handle all K2 patterns together
744
+ bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
570
745
  }
571
746
 
572
747
  return bpe_offsets;
@@ -672,6 +847,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
672
847
  return cpt; // Return the original code point if no lowercase mapping is found
673
848
  }
674
849
 
850
+ bool unicode_cpt_is_han(uint32_t cpt) { // returns true when cpt lies inside one of the fixed code point intervals listed below, false otherwise
851
+ // Han character ranges (Chinese/CJK characters), checked as inclusive intervals.
852
+ // NOTE(review): presumably an approximation of \p{Han}; only the intervals below are covered - confirm against the Unicode Script data if exact parity matters.
853
+ // CJK Unified Ideographs (most common)
+ if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true;
854
+
855
+ // CJK Extension A
856
+ if (cpt >= 0x3400 && cpt <= 0x4DBF) return true;
857
+
858
+ // CJK Extension B
859
+ if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true;
860
+
861
+ // CJK Extension C
862
+ if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;
863
+
864
+ // CJK Extension D
865
+ if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;
866
+
867
+ // CJK Extension E
868
+ if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true;
869
+
870
+ // CJK Extension F
871
+ if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true;
872
+
873
+ // CJK Compatibility Ideographs
874
+ if (cpt >= 0xF900 && cpt <= 0xFAFF) return true;
875
+
876
+ // CJK Compatibility Ideographs Supplement
877
+ if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true;
878
+
879
+ return false;
880
+ }
881
+
675
882
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
676
883
  // unicode categories
677
884
  static const std::map<std::string, int> k_ucat_enum = {
@@ -63,4 +63,6 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
63
63
 
64
64
  uint32_t unicode_tolower(uint32_t cpt);
65
65
 
66
+ bool unicode_cpt_is_han(uint32_t cpt);
67
+
66
68
  std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
@@ -81,6 +81,7 @@ enum llama_example {
81
81
  LLAMA_EXAMPLE_LOOKUP,
82
82
  LLAMA_EXAMPLE_PARALLEL,
83
83
  LLAMA_EXAMPLE_TTS,
84
+ LLAMA_EXAMPLE_DIFFUSION,
84
85
 
85
86
  LLAMA_EXAMPLE_COUNT,
86
87
  };
@@ -177,7 +178,8 @@ struct common_params_sampling {
177
178
  std::vector<common_grammar_trigger> grammar_triggers; // optional triggers (for lazy grammars)
178
179
  std::set<llama_token> preserved_tokens;
179
180
 
180
- std::vector<llama_logit_bias> logit_bias; // logit biases to apply
181
+ std::vector<llama_logit_bias> logit_bias; // logit biases to apply
182
+ std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
181
183
 
182
184
  // print the parameters into a string
183
185
  std::string print() const;
@@ -217,6 +219,14 @@ struct common_params_vocoder {
217
219
  bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
218
220
  };
219
221
 
222
+ struct common_params_diffusion { // diffusion settings; held by common_params as its `diffusion` member
223
+ int32_t steps = 64; // number of diffusion steps
224
+ float eps = 1e-3f; // epsilon for timesteps
225
+ int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY)
226
+ float alg_temp = 0.0f; // algorithm temperature
227
+ bool visual_mode = false; // show progressive diffusion on screen
228
+ };
229
+
220
230
  enum common_reasoning_format {
221
231
  COMMON_REASONING_FORMAT_NONE,
222
232
  COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
@@ -268,6 +278,7 @@ struct common_params {
268
278
  struct common_params_sampling sampling;
269
279
  struct common_params_speculative speculative;
270
280
  struct common_params_vocoder vocoder;
281
+ struct common_params_diffusion diffusion;
271
282
 
272
283
  struct common_params_model model;
273
284
 
@@ -330,6 +341,7 @@ struct common_params {
330
341
  bool no_perf = false; // disable performance metrics
331
342
  bool ctx_shift = true; // context shift on inifinite text generation
332
343
  bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
344
+ bool kv_unified = false; // enable unified KV cache
333
345
 
334
346
  bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
335
347
  bool use_mmap = true; // use mmap for faster loads
@@ -420,9 +432,10 @@ struct common_params {
420
432
  int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
421
433
  int32_t i_chunk = 0; // start processing from this chunk
422
434
 
423
- bool process_output = false; // collect data for the output tensor
424
- bool compute_ppl = true; // whether to compute perplexity
425
- bool parse_special = false; // whether to parse special tokens during imatrix tokenization
435
+ bool process_output = false; // collect data for the output tensor
436
+ bool compute_ppl = true; // whether to compute perplexity
437
+ bool show_statistics = false; // show imatrix statistics per tensor
438
+ bool parse_special = false; // whether to parse special tokens during imatrix tokenization
426
439
 
427
440
  // cvector-generator params
428
441
  int n_pca_batch = 100;
@@ -522,6 +535,7 @@ static bool string_starts_with(const std::string & str,
522
535
 
523
536
  // While we wait for C++20's std::string::ends_with...
524
537
  bool string_ends_with(const std::string_view & str, const std::string_view & suffix);
538
+ bool string_remove_suffix(std::string & str, const std::string_view & suffix);
525
539
  size_t string_find_partial_stop(const std::string_view & str, const std::string_view & stop);
526
540
 
527
541
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
@@ -71,12 +71,13 @@ extern "C" {
71
71
  typedef int32_t llama_seq_id;
72
72
 
73
73
  enum llama_vocab_type {
74
- LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
- LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
- LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
- LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
- LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
- LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
74
+ LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
75
+ LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
76
+ LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
77
+ LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
78
+ LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
79
+ LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
80
+ LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
80
81
  };
81
82
 
82
83
  enum llama_rope_type {
@@ -334,6 +335,9 @@ extern "C" {
334
335
  bool swa_full; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
335
336
  // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
336
337
  // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
338
+ bool kv_unified; // use a unified buffer across the input sequences when computing the attention
339
+ // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
340
+ // ref: https://github.com/ggml-org/llama.cpp/pull/14363
337
341
  };
338
342
 
339
343
  // model quantization parameters
@@ -724,7 +728,7 @@ extern "C" {
724
728
  // - lazily on next llama_decode()
725
729
  // p0 < 0 : [0, p1]
726
730
  // p1 < 0 : [p0, inf)
727
- DEPRECATED(void llama_kv_self_seq_div(
731
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
728
732
  struct llama_context * ctx,
729
733
  llama_seq_id seq_id,
730
734
  llama_pos p0,
@@ -952,6 +956,7 @@ extern "C" {
952
956
  // in the order they have appeared in the batch.
953
957
  // Rows: number of tokens for which llama_batch.logits[i] != 0
954
958
  // Cols: n_vocab
959
+ // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
955
960
  LLAMA_API float * llama_get_logits(struct llama_context * ctx);
956
961
 
957
962
  // Logits for the ith token. For positive indices, Equivalent to:
@@ -966,6 +971,7 @@ extern "C" {
966
971
  // in the order they have appeared in the batch.
967
972
  // shape: [n_outputs*n_embd]
968
973
  // Otherwise, returns NULL.
974
+ // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
969
975
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
970
976
 
971
977
  // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -1004,6 +1010,7 @@ extern "C" {
1004
1010
  LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
1005
1011
  LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
1006
1012
  LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
1013
+ LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask
1007
1014
 
1008
1015
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1009
1016
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
@@ -1389,6 +1396,7 @@ extern "C" {
1389
1396
 
1390
1397
  int32_t n_p_eval;
1391
1398
  int32_t n_eval;
1399
+ int32_t n_reused; // number of times a ggml compute graph had been reused
1392
1400
  };
1393
1401
 
1394
1402
  struct llama_perf_sampler_data {
@@ -10,7 +10,7 @@
10
10
  <key>DebugSymbolsPath</key>
11
11
  <string>dSYMs</string>
12
12
  <key>LibraryIdentifier</key>
13
- <string>xros-arm64_x86_64-simulator</string>
13
+ <string>ios-arm64_x86_64-simulator</string>
14
14
  <key>LibraryPath</key>
15
15
  <string>llama.framework</string>
16
16
  <key>SupportedArchitectures</key>
@@ -19,7 +19,7 @@
19
19
  <string>x86_64</string>
20
20
  </array>
21
21
  <key>SupportedPlatform</key>
22
- <string>xros</string>
22
+ <string>ios</string>
23
23
  <key>SupportedPlatformVariant</key>
24
24
  <string>simulator</string>
25
25
  </dict>
@@ -45,7 +45,7 @@
45
45
  <key>DebugSymbolsPath</key>
46
46
  <string>dSYMs</string>
47
47
  <key>LibraryIdentifier</key>
48
- <string>tvos-arm64</string>
48
+ <string>xros-arm64</string>
49
49
  <key>LibraryPath</key>
50
50
  <string>llama.framework</string>
51
51
  <key>SupportedArchitectures</key>
@@ -53,7 +53,7 @@
53
53
  <string>arm64</string>
54
54
  </array>
55
55
  <key>SupportedPlatform</key>
56
- <string>tvos</string>
56
+ <string>xros</string>
57
57
  </dict>
58
58
  <dict>
59
59
  <key>BinaryPath</key>
@@ -61,26 +61,23 @@
61
61
  <key>DebugSymbolsPath</key>
62
62
  <string>dSYMs</string>
63
63
  <key>LibraryIdentifier</key>
64
- <string>ios-arm64_x86_64-simulator</string>
64
+ <string>tvos-arm64</string>
65
65
  <key>LibraryPath</key>
66
66
  <string>llama.framework</string>
67
67
  <key>SupportedArchitectures</key>
68
68
  <array>
69
69
  <string>arm64</string>
70
- <string>x86_64</string>
71
70
  </array>
72
71
  <key>SupportedPlatform</key>
73
- <string>ios</string>
74
- <key>SupportedPlatformVariant</key>
75
- <string>simulator</string>
72
+ <string>tvos</string>
76
73
  </dict>
77
74
  <dict>
78
75
  <key>BinaryPath</key>
79
- <string>llama.framework/Versions/A/llama</string>
76
+ <string>llama.framework/llama</string>
80
77
  <key>DebugSymbolsPath</key>
81
78
  <string>dSYMs</string>
82
79
  <key>LibraryIdentifier</key>
83
- <string>macos-arm64_x86_64</string>
80
+ <string>xros-arm64_x86_64-simulator</string>
84
81
  <key>LibraryPath</key>
85
82
  <string>llama.framework</string>
86
83
  <key>SupportedArchitectures</key>
@@ -89,23 +86,26 @@
89
86
  <string>x86_64</string>
90
87
  </array>
91
88
  <key>SupportedPlatform</key>
92
- <string>macos</string>
89
+ <string>xros</string>
90
+ <key>SupportedPlatformVariant</key>
91
+ <string>simulator</string>
93
92
  </dict>
94
93
  <dict>
95
94
  <key>BinaryPath</key>
96
- <string>llama.framework/llama</string>
95
+ <string>llama.framework/Versions/A/llama</string>
97
96
  <key>DebugSymbolsPath</key>
98
97
  <string>dSYMs</string>
99
98
  <key>LibraryIdentifier</key>
100
- <string>xros-arm64</string>
99
+ <string>macos-arm64_x86_64</string>
101
100
  <key>LibraryPath</key>
102
101
  <string>llama.framework</string>
103
102
  <key>SupportedArchitectures</key>
104
103
  <array>
105
104
  <string>arm64</string>
105
+ <string>x86_64</string>
106
106
  </array>
107
107
  <key>SupportedPlatform</key>
108
- <string>xros</string>
108
+ <string>macos</string>
109
109
  </dict>
110
110
  <dict>
111
111
  <key>BinaryPath</key>