@novastera-oss/llamarn 0.2.7 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/android/src/main/cpp/include/llama.h +8 -3
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +56 -22
  11. package/cpp/build-info.cpp +2 -2
  12. package/cpp/llama.cpp/CMakeLists.txt +1 -1
  13. package/cpp/llama.cpp/common/arg.cpp +7 -0
  14. package/cpp/llama.cpp/common/common.cpp +3 -0
  15. package/cpp/llama.cpp/common/common.h +1 -0
  16. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  17. package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
  18. package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
  19. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  20. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  21. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
  22. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
  23. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
  24. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
  25. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
  26. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
  27. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
  28. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  29. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
  30. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
  31. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
  32. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
  33. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
  34. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
  35. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  48. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  62. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
  64. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
  65. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  66. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
  67. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  68. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
  69. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  70. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  71. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
  72. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
  73. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  74. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  76. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
  77. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
  78. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  79. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  80. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  81. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
  82. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  83. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  84. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  89. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
  90. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
  92. package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
  93. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  94. package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
  95. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
  96. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
  97. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  98. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  99. package/cpp/llama.cpp/include/llama.h +8 -3
  100. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  101. package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
  102. package/cpp/llama.cpp/src/llama-arch.h +18 -0
  103. package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
  104. package/cpp/llama.cpp/src/llama-batch.h +98 -70
  105. package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
  106. package/cpp/llama.cpp/src/llama-context.cpp +101 -107
  107. package/cpp/llama.cpp/src/llama-context.h +13 -13
  108. package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
  109. package/cpp/llama.cpp/src/llama-graph.h +44 -32
  110. package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
  111. package/cpp/llama.cpp/src/llama-hparams.h +8 -0
  112. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
  113. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
  114. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
  115. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
  116. package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
  117. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
  118. package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
  119. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
  120. package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
  121. package/cpp/llama.cpp/src/llama-memory.h +18 -22
  122. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  123. package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
  124. package/cpp/llama.cpp/src/llama-model.h +22 -0
  125. package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
  126. package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
  127. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  128. package/cpp/rn-utils.h +3 -0
  129. package/ios/include/common.h +1 -0
  130. package/ios/include/llama.h +8 -3
  131. package/ios/libs/llama.xcframework/Info.plist +19 -19
  132. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  133. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  134. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  135. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  136. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
  137. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  138. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  139. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  140. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  141. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  142. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  143. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  144. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  145. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  146. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  147. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
  148. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  149. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  150. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
  151. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  152. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  153. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
  154. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  155. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  160. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  161. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
  162. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  163. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  164. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
  165. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  167. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
  168. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  173. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
  175. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
  178. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  183. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  184. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  186. package/package.json +1 -1
@@ -390,6 +390,7 @@ extern "C" {
390
390
  void * imatrix; // pointer to importance matrix data
391
391
  void * kv_overrides; // pointer to vector containing overrides
392
392
  void * tensor_types; // pointer to vector containing tensor types
393
+ void * prune_layers; // pointer to vector containing layer indices to prune
393
394
  } llama_model_quantize_params;
394
395
 
395
396
  typedef struct llama_logit_bias {
@@ -943,12 +944,14 @@ extern "C" {
943
944
  // Requires the context to have a memory.
944
945
  // For encode-decoder contexts, processes the batch using the decoder.
945
946
  // Positive return values does not mean a fatal error, but rather a warning.
946
- // Upon non-zero return values, the memory state is restored to the state before this call
947
+ // Upon fatal-error or abort, the ubatches that managed to be been processed will remain in the memory state of the context
948
+ // To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
949
+ // Upon other return values, the memory state is restored to the state before this call
947
950
  // 0 - success
948
951
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
949
- // 2 - aborted
952
+ // 2 - aborted (processed ubatches will remain in the context's memory)
950
953
  // -1 - invalid input batch
951
- // < -1 - error
954
+ // < -1 - fatal error (processed ubatches will remain in the context's memory)
952
955
  LLAMA_API int32_t llama_decode(
953
956
  struct llama_context * ctx,
954
957
  struct llama_batch batch);
@@ -1044,6 +1047,7 @@ extern "C" {
1044
1047
 
1045
1048
  LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
1046
1049
  LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
1050
+ LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
1047
1051
 
1048
1052
  LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
1049
1053
  LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
@@ -1087,6 +1091,7 @@ extern "C" {
1087
1091
  /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
1088
1092
  /// @return Returns the number of tokens on success, no more than n_tokens_max
1089
1093
  /// @return Returns a negative number on failure - the number of tokens that would have been returned
1094
+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
1090
1095
  /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
1091
1096
  /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
1092
1097
  /// as plaintext. Does not insert a leading space.
@@ -5,6 +5,7 @@
5
5
  #include <cstdlib>
6
6
  #include <ctime>
7
7
  #include <chrono>
8
+ #include <thread>
8
9
  #include <fstream>
9
10
  #include <iostream>
10
11
  #include <random>
@@ -50,33 +51,60 @@ LlamaCppModel::~LlamaCppModel() {
50
51
  }
51
52
 
52
53
  void LlamaCppModel::release() {
53
- // Cancel any ongoing predictions
54
+ // Signal completion to stop and wait for it to finish gracefully
54
55
  if (is_predicting_) {
55
56
  should_stop_completion_ = true;
56
57
 
57
- // Optionally wait a bit for completion to stop
58
+ // Wait more patiently for completion to stop, with proper backoff
58
59
  int retry = 0;
59
- while (is_predicting_ && retry < 10) {
60
- std::this_thread::sleep_for(std::chrono::milliseconds(10));
60
+ while (is_predicting_ && retry < 100) { // Increased from 10 to 100
61
+ std::this_thread::sleep_for(std::chrono::milliseconds(retry < 50 ? 10 : 50));
61
62
  retry++;
62
63
  }
64
+
65
+ // Force stop if still predicting
66
+ if (is_predicting_) {
67
+ is_predicting_ = false;
68
+ }
63
69
  }
64
70
 
65
- // Clean up our resources
71
+ // Clean up our resources with proper mutex protection
66
72
  if (rn_ctx_) {
73
+ std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
74
+
75
+ // Clear KV cache before freeing context (following server.cpp pattern)
67
76
  if (rn_ctx_->ctx) {
77
+ try {
78
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
79
+ } catch (...) {
80
+ // Ignore errors during cache clearing
81
+ }
82
+
68
83
  llama_free(rn_ctx_->ctx);
69
84
  rn_ctx_->ctx = nullptr;
70
85
  }
71
86
 
87
+ // Free model after context (following server.cpp cleanup order)
72
88
  if (rn_ctx_->model) {
73
89
  llama_model_free(rn_ctx_->model);
74
90
  rn_ctx_->model = nullptr;
75
91
  }
76
92
 
93
+ // Clean up additional resources
94
+ rn_ctx_->vocab = nullptr; // This is owned by the model, so just null it
95
+ rn_ctx_->chat_templates.reset(); // Clean up chat templates
96
+ rn_ctx_->lora_adapters.clear(); // Clear LoRA adapters
97
+
98
+ // Reset state flags
99
+ rn_ctx_->model_loaded = false;
100
+
77
101
  // Note: rn_ctx_ itself is owned by the module, so we don't delete it here
78
102
  rn_ctx_ = nullptr;
79
103
  }
104
+
105
+ // Reset our internal state
106
+ should_stop_completion_ = false;
107
+ is_predicting_ = false;
80
108
  }
81
109
 
82
110
  int32_t LlamaCppModel::getVocabSize() const {
@@ -133,6 +161,10 @@ CompletionOptions LlamaCppModel::parseCompletionOptions(jsi::Runtime& rt, const
133
161
  options.min_p = obj.getProperty(rt, "min_p").asNumber();
134
162
  }
135
163
 
164
+ if (obj.hasProperty(rt, "presence_penalty") && !obj.getProperty(rt, "presence_penalty").isUndefined()) {
165
+ options.presence_penalty = obj.getProperty(rt, "presence_penalty").asNumber();
166
+ }
167
+
136
168
  if (obj.hasProperty(rt, "n_predict") && !obj.getProperty(rt, "n_predict").isUndefined()) {
137
169
  options.n_predict = obj.getProperty(rt, "n_predict").asNumber();
138
170
  } else if (obj.hasProperty(rt, "max_tokens") && !obj.getProperty(rt, "max_tokens").isUndefined()) {
@@ -365,13 +397,14 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
365
397
  std::lock_guard<std::mutex> lock(rn_ctx_->mutex);
366
398
 
367
399
  // Clear the context KV cache
368
- llama_kv_self_clear(rn_ctx_->ctx);
400
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
369
401
 
370
402
  // Store original sampling parameters to restore later
371
403
  float orig_temp = rn_ctx_->params.sampling.temp;
372
404
  float orig_top_p = rn_ctx_->params.sampling.top_p;
373
405
  float orig_top_k = rn_ctx_->params.sampling.top_k;
374
406
  float orig_min_p = rn_ctx_->params.sampling.min_p;
407
+ float orig_presence_penalty = rn_ctx_->params.sampling.penalty_present;
375
408
  int orig_n_predict = rn_ctx_->params.n_predict;
376
409
 
377
410
  // Set sampling parameters from options
@@ -379,6 +412,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
379
412
  rn_ctx_->params.sampling.top_p = options.top_p;
380
413
  rn_ctx_->params.sampling.top_k = options.top_k;
381
414
  rn_ctx_->params.sampling.min_p = options.min_p;
415
+ rn_ctx_->params.sampling.penalty_present = options.presence_penalty;
382
416
  rn_ctx_->params.n_predict = options.n_predict;
383
417
 
384
418
  // Check for a partial callback
@@ -426,6 +460,7 @@ CompletionResult LlamaCppModel::completion(const CompletionOptions& options, std
426
460
  rn_ctx_->params.sampling.top_p = orig_top_p;
427
461
  rn_ctx_->params.sampling.top_k = orig_top_k;
428
462
  rn_ctx_->params.sampling.min_p = orig_min_p;
463
+ rn_ctx_->params.sampling.penalty_present = orig_presence_penalty;
429
464
  rn_ctx_->params.n_predict = orig_n_predict;
430
465
 
431
466
  return result;
@@ -885,29 +920,28 @@ jsi::Value LlamaCppModel::embeddingJsi(jsi::Runtime& rt, const jsi::Value* args,
885
920
  }
886
921
 
887
922
  // Clear the context KV cache to ensure clean embedding
888
- llama_kv_self_clear(rn_ctx_->ctx);
923
+ llama_memory_clear(llama_get_memory(rn_ctx_->ctx), true);
889
924
 
890
925
  // Enable embedding mode
891
926
  llama_set_embeddings(rn_ctx_->ctx, true);
892
927
 
893
- // Evaluate tokens one by one
928
+ // Create and populate batch using common_batch functions (following server.cpp pattern)
929
+ llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
930
+
931
+ common_batch_clear(batch);
894
932
  for (int i = 0; i < (int)tokens.size(); i++) {
895
- llama_token token = tokens[i];
896
- llama_batch batch = {
897
- /* n_tokens */ 1,
898
- /* token */ &token,
899
- /* embd */ nullptr,
900
- /* pos */ &i,
901
- /* n_seq_id */ nullptr,
902
- /* seq_id */ nullptr,
903
- /* logits */ nullptr
904
- };
905
-
906
- if (llama_decode(rn_ctx_->ctx, batch) != 0) {
907
- throw std::runtime_error("Failed to decode token for embedding");
908
- }
933
+ // For embeddings, we typically need logits for the last token (for pooling)
934
+ bool needs_logits = (i == (int)tokens.size() - 1);
935
+ common_batch_add(batch, tokens[i], i, {0}, needs_logits);
909
936
  }
910
937
 
938
+ if (llama_decode(rn_ctx_->ctx, batch) != 0) {
939
+ llama_batch_free(batch);
940
+ throw std::runtime_error("Failed to decode tokens for embedding");
941
+ }
942
+
943
+ llama_batch_free(batch);
944
+
911
945
  // Get embedding size from the model
912
946
  const int n_embd = llama_model_n_embd(rn_ctx_->model);
913
947
  if (n_embd <= 0) {
@@ -1,4 +1,4 @@
1
- int LLAMA_BUILD_NUMBER = 5709;
2
- char const *LLAMA_COMMIT = "d67341dc";
1
+ int LLAMA_BUILD_NUMBER = 5770;
2
+ char const *LLAMA_COMMIT = "b25e9277";
3
3
  char const *LLAMA_COMPILER = "unknown";
4
4
  char const *LLAMA_BUILD_TARGET = "unknown";
@@ -95,7 +95,7 @@ endif()
95
95
  if (NOT DEFINED LLAMA_BUILD_COMMIT)
96
96
  set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
97
97
  endif()
98
- set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
98
+ set(LLAMA_INSTALL_VERSION 0.0.${LLAMA_BUILD_NUMBER})
99
99
 
100
100
  # override ggml options
101
101
  set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
2706
2706
  params.embd_sep = value;
2707
2707
  }
2708
2708
  ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2709
+ add_opt(common_arg(
2710
+ {"--cls-separator"}, "STRING",
2711
+ "separator of classification sequences (default \\t) for example \"<#seq#>\"",
2712
+ [](common_params & params, const std::string & value) {
2713
+ params.cls_sep = value;
2714
+ }
2715
+ ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
2709
2716
  add_opt(common_arg(
2710
2717
  {"--host"}, "HOST",
2711
2718
  string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
@@ -1290,6 +1290,9 @@ std::vector<llama_token> common_tokenize(
1290
1290
  int n_tokens = text.length() + 2 * add_special;
1291
1291
  std::vector<llama_token> result(n_tokens);
1292
1292
  n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
1293
+ if (n_tokens == std::numeric_limits<int32_t>::min()) {
1294
+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit");
1295
+ }
1293
1296
  if (n_tokens < 0) {
1294
1297
  result.resize(-n_tokens);
1295
1298
  int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
@@ -358,6 +358,7 @@ struct common_params {
358
358
  int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
359
359
  std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
360
360
  std::string embd_sep = "\n"; // separator of embeddings
361
+ std::string cls_sep = "\t"; // separator of classification sequences
361
362
 
362
363
  // server params
363
364
  int32_t port = 8080; // server listens on this network port
@@ -41,49 +41,6 @@ static std::string build_repetition(const std::string & item_rule, int min_items
41
41
  return result;
42
42
  }
43
43
 
44
- /* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
45
- class string_view {
46
- const std::string & _str;
47
- const size_t _start;
48
- const size_t _end;
49
- public:
50
- string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
51
-
52
- size_t size() const {
53
- return _end - _start;
54
- }
55
-
56
- size_t length() const {
57
- return size();
58
- }
59
-
60
- operator std::string() const {
61
- return str();
62
- }
63
-
64
- std::string str() const {
65
- return _str.substr(_start, _end - _start);
66
- }
67
-
68
- string_view substr(size_t pos, size_t len = std::string::npos) const {
69
- return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
70
- }
71
-
72
- char operator[](size_t pos) const {
73
- auto index = _start + pos;
74
- if (index >= _end) {
75
- throw std::out_of_range("string_view index out of range");
76
- }
77
- return _str[_start + pos];
78
- }
79
-
80
- bool operator==(const string_view & other) const {
81
- std::string this_str = *this;
82
- std::string other_str = other;
83
- return this_str == other_str;
84
- }
85
- };
86
-
87
44
  static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
88
45
  auto has_min = min_value != std::numeric_limits<int>::min();
89
46
  auto has_max = max_value != std::numeric_limits<int>::max();
@@ -112,14 +69,14 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
112
69
  }
113
70
  out << "}";
114
71
  };
115
- std::function<void(const string_view &, const string_view &)> uniform_range =
116
- [&](const string_view & from, const string_view & to) {
72
+ std::function<void(const std::string_view &, const std::string_view &)> uniform_range =
73
+ [&](const std::string_view & from, const std::string_view & to) {
117
74
  size_t i = 0;
118
75
  while (i < from.length() && i < to.length() && from[i] == to[i]) {
119
76
  i++;
120
77
  }
121
78
  if (i > 0) {
122
- out << "\"" << from.substr(0, i).str() << "\"";
79
+ out << "\"" << from.substr(0, i) << "\"";
123
80
  }
124
81
  if (i < from.length() && i < to.length()) {
125
82
  if (i > 0) {
@@ -310,6 +310,8 @@ class ModelBase:
310
310
  gguf.MODEL_TENSOR.POSNET_NORM2,
311
311
  gguf.MODEL_TENSOR.V_ENC_EMBD_POS,
312
312
  gguf.MODEL_TENSOR.A_ENC_EMBD_POS,
313
+ gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF,
314
+ gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF,
313
315
  )
314
316
  )
315
317
  or not new_name.endswith(".weight")
@@ -320,7 +322,11 @@ class ModelBase:
320
322
  self.match_model_tensor_name(new_name, key, bid)
321
323
  for key in (
322
324
  gguf.MODEL_TENSOR.TOKEN_EMBD,
325
+ gguf.MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
323
326
  gguf.MODEL_TENSOR.OUTPUT,
327
+ gguf.MODEL_TENSOR.ALTUP_ROUTER,
328
+ gguf.MODEL_TENSOR.LAUREL_L,
329
+ gguf.MODEL_TENSOR.LAUREL_R,
324
330
  )
325
331
  ):
326
332
  if self.ftype in (
@@ -921,13 +927,20 @@ class TextModel(ModelBase):
921
927
  tokenizer = SentencePieceProcessor()
922
928
  tokenizer.LoadFromFile(str(tokenizer_path))
923
929
 
924
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
930
+ vocab_size = self.find_hparam([
931
+ "vocab_size_per_layer_input", # gemma3n
932
+ "vocab_size",
933
+ ], optional=True) or tokenizer.vocab_size()
925
934
 
926
935
  tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
927
936
  scores: list[float] = [-10000.0] * vocab_size
928
937
  toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
929
938
 
930
939
  for token_id in range(tokenizer.vocab_size()):
940
+ if token_id >= vocab_size:
941
+ logger.warning(f'ignore tokens from {token_id}: id is out of range, max={vocab_size - 1}')
942
+ break
943
+
931
944
  piece = tokenizer.IdToPiece(token_id)
932
945
  text = piece.encode("utf-8")
933
946
  score = tokenizer.GetScore(token_id)
@@ -2145,7 +2158,6 @@ class Llama4Model(LlamaModel):
2145
2158
 
2146
2159
  def set_vocab(self):
2147
2160
  self._set_vocab_gpt2()
2148
- self.gguf_writer.add_add_bos_token(True)
2149
2161
 
2150
2162
  def set_gguf_parameters(self):
2151
2163
  super().set_gguf_parameters()
@@ -2194,7 +2206,7 @@ class Llama4VisionModel(MmprojModel):
2194
2206
  name += ".weight"
2195
2207
  if "multi_modal_projector.linear_1" in name:
2196
2208
  # despite the name with number postfix, this is a single fully connected layer
2197
- return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC], data_torch)]
2209
+ return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)]
2198
2210
  return [(self.map_tensor_name(name), data_torch)]
2199
2211
  return []
2200
2212
 
@@ -3918,9 +3930,6 @@ class BertModel(TextModel):
3918
3930
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
3919
3931
  special_vocab.add_to_gguf(self.gguf_writer)
3920
3932
 
3921
- self.gguf_writer.add_add_bos_token(True)
3922
- self.gguf_writer.add_add_eos_token(True)
3923
-
3924
3933
 
3925
3934
  @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
3926
3935
  class DistilBertModel(BertModel):
@@ -3962,8 +3971,6 @@ class RobertaModel(BertModel):
3962
3971
  bpe_tok_path = self.dir_model / "tokenizer.json"
3963
3972
  if bpe_tok_path.exists():
3964
3973
  self._set_vocab_gpt2()
3965
- self.gguf_writer.add_add_bos_token(True)
3966
- self.gguf_writer.add_add_eos_token(True)
3967
3974
 
3968
3975
  # we need this to validate the size of the token_type embeddings
3969
3976
  # though currently we are passing all zeros to the token_type embeddings
@@ -4223,6 +4230,7 @@ class Gemma2Model(TextModel):
4223
4230
  @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration")
4224
4231
  class Gemma3Model(TextModel):
4225
4232
  model_arch = gguf.MODEL_ARCH.GEMMA3
4233
+ norm_shift = 1.0 # Gemma3RMSNorm adds 1.0 to the norm value
4226
4234
 
4227
4235
  def set_vocab(self):
4228
4236
  self._set_vocab_sentencepiece()
@@ -4244,9 +4252,8 @@ class Gemma3Model(TextModel):
4244
4252
  self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
4245
4253
  self.gguf_writer.add_file_type(self.ftype)
4246
4254
  self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
4247
- # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3
4255
+ # attn_logit_softcapping is removed in Gemma3
4248
4256
  assert hparams.get("attn_logit_softcapping") is None
4249
- assert hparams.get("final_logit_softcapping") is None
4250
4257
  self.gguf_writer.add_sliding_window(hparams["sliding_window"])
4251
4258
  self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
4252
4259
  if hparams.get("rope_scaling") is not None:
@@ -4258,7 +4265,7 @@ class Gemma3Model(TextModel):
4258
4265
  def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4259
4266
  del bid # unused
4260
4267
 
4261
- if name.startswith("language_model."):
4268
+ if "language_model." in name:
4262
4269
  name = name.replace("language_model.", "")
4263
4270
 
4264
4271
  elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
@@ -4273,8 +4280,9 @@ class Gemma3Model(TextModel):
4273
4280
 
4274
4281
  # ref code in Gemma3RMSNorm
4275
4282
  # output = output * (1.0 + self.weight.float())
4283
+ # note: this is not the case on gemma3n
4276
4284
  if name.endswith("norm.weight"):
4277
- data_torch = data_torch + 1
4285
+ data_torch = data_torch + self.norm_shift
4278
4286
 
4279
4287
  return [(self.map_tensor_name(name), data_torch)]
4280
4288
 
@@ -4331,6 +4339,104 @@ class Gemma3VisionModel(MmprojModel):
4331
4339
  return [] # skip other tensors
4332
4340
 
4333
4341
 
4342
+ @ModelBase.register("Gemma3nForConditionalGeneration")
4343
+ class Gemma3NModel(Gemma3Model):
4344
+ model_arch = gguf.MODEL_ARCH.GEMMA3N
4345
+ norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
4346
+
4347
+ _altup_proj: list[Tensor] = []
4348
+ _altup_unembd: list[Tensor] = []
4349
+
4350
+ def __init__(self, *args, **kwargs):
4351
+ super().__init__(*args, **kwargs)
4352
+ assert self.hparams["altup_num_inputs"] == 4, "Current conversion only supports 4 altup inputs"
4353
+ self._altup_proj = [
4354
+ torch.Tensor(), # to be replaced
4355
+ torch.Tensor(), # to be replaced
4356
+ torch.Tensor(), # to be replaced
4357
+ ]
4358
+ self._altup_unembd = [
4359
+ torch.Tensor(), # to be replaced
4360
+ torch.Tensor(), # to be replaced
4361
+ torch.Tensor(), # to be replaced
4362
+ ]
4363
+
4364
+ def set_vocab(self):
4365
+ with open(self.dir_model / "chat_template.jinja") as f:
4366
+ # quick hack to make sure chat template is added
4367
+ self.gguf_writer.add_chat_template(f.read())
4368
+ super().set_vocab()
4369
+
4370
+ def set_gguf_parameters(self):
4371
+ super().set_gguf_parameters()
4372
+ self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
4373
+ self.gguf_writer.add_altup_num_inputs(self.hparams["altup_num_inputs"])
4374
+ self.gguf_writer.add_embedding_length_per_layer_input(self.hparams["hidden_size_per_layer_input"])
4375
+ self.gguf_writer.add_shared_kv_layers(self.hparams["num_kv_shared_layers"])
4376
+
4377
+ activation_sparsity_scale = []
4378
+ for s in self.hparams["activation_sparsity_pattern"]:
4379
+ normal_dist = torch.distributions.normal.Normal(0, 1)
4380
+ std_multiplier = normal_dist.icdf(torch.tensor(s, dtype=torch.float32))
4381
+ activation_sparsity_scale.append(std_multiplier.item())
4382
+ self.gguf_writer.add_activation_sparsity_scale(activation_sparsity_scale)
4383
+
4384
+ sliding_window_pattern = []
4385
+ for t in self.hparams["layer_types"]:
4386
+ sliding_window_pattern.append(t == "sliding_attention")
4387
+ self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
4388
+
4389
+ def _stack_matrices(self, matrices: list[Tensor]) -> Tensor | None:
4390
+ has_all = all(m.numel() > 0 for m in matrices)
4391
+ if not has_all:
4392
+ return None
4393
+ else:
4394
+ return torch.stack(matrices, dim=0)
4395
+
4396
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
4397
+ if name.endswith("_scale"):
4398
+ name = name + ".weight"
4399
+
4400
+ # TODO: implement self.prediction_coefs.weight.clamp_(...)
4401
+
4402
+ if "language_model." not in name:
4403
+ return [] # skip non-language model tensors
4404
+
4405
+ if "altup_unembed_projections" in name:
4406
+ data_torch = data_torch.to(device="cpu")
4407
+ if ".0." in name:
4408
+ self._altup_unembd[0] = data_torch
4409
+ elif ".1." in name:
4410
+ self._altup_unembd[1] = data_torch
4411
+ elif ".2." in name:
4412
+ self._altup_unembd[2] = data_torch
4413
+ else:
4414
+ raise ValueError(f"Unknown name: {name}")
4415
+ out = self._stack_matrices(self._altup_unembd)
4416
+ if out is not None:
4417
+ return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)]
4418
+ else:
4419
+ return []
4420
+
4421
+ if "altup_projections" in name:
4422
+ data_torch = data_torch.to(device="cpu")
4423
+ if ".0." in name:
4424
+ self._altup_proj[0] = data_torch
4425
+ elif ".1." in name:
4426
+ self._altup_proj[1] = data_torch
4427
+ elif ".2." in name:
4428
+ self._altup_proj[2] = data_torch
4429
+ else:
4430
+ raise ValueError(f"Unknown name: {name}")
4431
+ out = self._stack_matrices(self._altup_proj)
4432
+ if out is not None:
4433
+ return [(self.map_tensor_name("model.altup_projections.weight"), out)]
4434
+ else:
4435
+ return []
4436
+
4437
+ return super().modify_tensors(data_torch, name, bid)
4438
+
4439
+
4334
4440
  @ModelBase.register("Starcoder2ForCausalLM")
4335
4441
  class StarCoder2Model(TextModel):
4336
4442
  model_arch = gguf.MODEL_ARCH.STARCODER2
@@ -4848,8 +4954,6 @@ class JinaBertV2Model(BertModel):
4848
4954
  self.gguf_writer.add_token_type_count(2)
4849
4955
  else:
4850
4956
  raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
4851
- self.gguf_writer.add_add_bos_token(True)
4852
- self.gguf_writer.add_add_eos_token(True)
4853
4957
 
4854
4958
 
4855
4959
  @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5555,6 @@ class T5Model(TextModel):
5451
5555
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5452
5556
  special_vocab.add_to_gguf(self.gguf_writer)
5453
5557
 
5454
- self.gguf_writer.add_add_bos_token(False)
5455
- self.gguf_writer.add_add_eos_token(True)
5456
-
5457
5558
  def set_gguf_parameters(self):
5458
5559
  if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
5459
5560
  logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5692,6 @@ class T5EncoderModel(TextModel):
5591
5692
  special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
5592
5693
  special_vocab.add_to_gguf(self.gguf_writer)
5593
5694
 
5594
- self.gguf_writer.add_add_bos_token(False)
5595
- self.gguf_writer.add_add_eos_token(True)
5596
-
5597
5695
  def set_gguf_parameters(self):
5598
5696
  if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
5599
5697
  logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -131,6 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
131
131
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
132
132
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
133
133
  option(GGML_VXE "ggml: enable vxe" ON)
134
+ option(GGML_NNPA "ggml: enable nnpa" ON)
134
135
 
135
136
  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
136
137
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -101,6 +101,7 @@ extern "C" {
101
101
  GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
102
102
  GGML_BACKEND_API int ggml_cpu_has_vsx (void);
103
103
  GGML_BACKEND_API int ggml_cpu_has_vxe (void);
104
+ GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
104
105
  GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
105
106
  GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
106
107
 
@@ -133,6 +134,7 @@ extern "C" {
133
134
 
134
135
  GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
135
136
 
137
+ GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
136
138
  GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
137
139
  GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
138
140
  GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);