@novastera-oss/llamarn 0.2.5 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/RNLlamaCpp.podspec +3 -2
  2. package/android/CMakeLists.txt +6 -3
  3. package/android/src/main/cpp/include/llama.h +140 -38
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  12. package/cpp/LlamaCppModel.cpp +48 -67
  13. package/cpp/LlamaCppModel.h +8 -3
  14. package/cpp/PureCppImpl.cpp +1 -1
  15. package/cpp/PureCppImpl.h +2 -2
  16. package/cpp/build-info.cpp +2 -2
  17. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  18. package/cpp/llama.cpp/Makefile +2 -2
  19. package/cpp/llama.cpp/README.md +33 -13
  20. package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
  21. package/cpp/llama.cpp/common/arg.cpp +38 -12
  22. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  23. package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
  24. package/cpp/llama.cpp/common/chat-parser.h +4 -1
  25. package/cpp/llama.cpp/common/chat.cpp +16 -13
  26. package/cpp/llama.cpp/common/chat.h +1 -1
  27. package/cpp/llama.cpp/common/common.cpp +52 -40
  28. package/cpp/llama.cpp/common/common.h +5 -2
  29. package/cpp/llama.cpp/common/json-partial.cpp +5 -4
  30. package/cpp/llama.cpp/common/json-partial.h +2 -1
  31. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
  32. package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
  33. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  34. package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  37. package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
  38. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
  39. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
  41. package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  44. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  74. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  79. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  82. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  83. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  88. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
  109. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  112. package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
  113. package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
  114. package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
  115. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  116. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  117. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  118. package/cpp/llama.cpp/include/llama.h +140 -38
  119. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  120. package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
  121. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  122. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  123. package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
  124. package/cpp/llama.cpp/src/llama-batch.h +47 -17
  125. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  126. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  127. package/cpp/llama.cpp/src/llama-context.cpp +488 -313
  128. package/cpp/llama.cpp/src/llama-context.h +38 -17
  129. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  130. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  131. package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
  132. package/cpp/llama.cpp/src/llama-graph.h +109 -52
  133. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  134. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  135. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
  136. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
  137. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
  139. package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
  140. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  141. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  142. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
  144. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  145. package/cpp/llama.cpp/src/llama-memory.h +89 -4
  146. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  147. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  148. package/cpp/llama.cpp/src/llama-model.cpp +735 -143
  149. package/cpp/llama.cpp/src/llama-model.h +4 -0
  150. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  151. package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
  152. package/cpp/llama.cpp/src/llama.cpp +11 -7
  153. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  154. package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
  155. package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
  156. package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
  157. package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
  158. package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
  159. package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
  160. package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
  161. package/cpp/rn-completion.cpp +65 -10
  162. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  163. package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
  164. package/ios/include/chat.h +1 -1
  165. package/ios/include/common/minja/chat-template.hpp +1 -1
  166. package/ios/include/common/minja/minja.hpp +1 -1
  167. package/ios/include/common.h +5 -2
  168. package/ios/include/json-schema-to-grammar.h +4 -4
  169. package/ios/include/llama.h +140 -38
  170. package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
  171. package/ios/libs/llama.xcframework/Info.plist +20 -20
  172. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
  174. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
  175. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
  176. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  177. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  178. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  179. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
  180. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  181. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  182. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  184. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
  185. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
  186. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
  187. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
  188. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
  189. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
  190. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  191. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
  192. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
  193. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  194. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  195. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  196. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
  197. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
  198. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
  199. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  200. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  201. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
  202. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
  203. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  204. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  205. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  206. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  207. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
  208. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
  209. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
  210. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  211. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  212. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
  213. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
  214. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
  215. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
  216. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  217. package/package.json +1 -2
  218. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  219. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  220. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  221. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
  222. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
  223. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  224. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  225. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
package/cpp/llama.cpp/convert_hf_to_gguf.py

@@ -519,7 +519,7 @@ class TextModel(ModelBase):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
 
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
             self.gguf_writer.add_context_length(n_ctx)
             logger.info(f"gguf: context length = {n_ctx}")
 
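The hunk above widens the set of config keys that can supply the context length, so checkpoints that only declare "max_length" are now covered. A minimal sketch of the first-match lookup this relies on (a hypothetical standalone helper, not the converter's own find_hparam):

    from typing import Any, Optional

    def first_hparam(hparams: dict[str, Any], keys: list[str]) -> Optional[Any]:
        # Return the value of the first candidate key present in the config, else None.
        for key in keys:
            if key in hparams:
                return hparams[key]
        return None

    # A config that only declares "max_length" still yields a context length.
    ctx = first_hparam({"max_length": 2048},
                       ["max_position_embeddings", "n_ctx", "n_positions", "max_length"])
    print(ctx)  # 2048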
@@ -1047,6 +1047,10 @@ class TextModel(ModelBase):
         special_vocab.chat_template = "rwkv-world"
         # hack: Add '\n\n' as the EOT token to make it chat normally
         special_vocab._set_special_token("eot", 261)
+        # hack: Override these as they have already been set (incorrectly)
+        special_vocab.special_token_ids["bos"] = 0
+        special_vocab.special_token_ids["eos"] = 0
+
         special_vocab.add_to_gguf(self.gguf_writer)
 
     def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int):
@@ -1894,9 +1898,7 @@ class LlamaModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
@@ -1978,7 +1980,8 @@ class LlamaModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
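The last hunk only changes how dim is obtained (prefer an explicit head_dim hparam, otherwise hidden_size divided by the attention-head count); the RoPE inverse-frequency line itself is untouched. A standalone illustration of that formula with made-up values:

    import torch

    base = 10000.0  # hypothetical rope_theta
    dim = 8         # hypothetical head dimension
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    print(freqs)    # tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])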
@@ -2013,6 +2016,20 @@ class LlamaModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ArceeForCausalLM")
+class ArceeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.ARCEE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
 @ModelBase.register(
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
@@ -2300,9 +2317,7 @@ class DeciModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
@@ -2342,7 +2357,8 @@ class DeciModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3660,9 +3676,7 @@ class InternLM3Model(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
         self.gguf_writer.add_rope_dimension_count(rope_dim)
 
@@ -3705,8 +3719,7 @@ class BertModel(TextModel):
         self._try_set_pooling_type()
 
         if self.cls_out_labels:
-            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
 
     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()
@@ -3810,7 +3823,7 @@ class BertModel(TextModel):
             remove_whitespaces = tokenizer.clean_up_tokenization_spaces
             precompiled_charsmap = b64decode(tokenizer_json["normalizer"]["precompiled_charsmap"])
 
-            vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size)
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size)
         else:
             sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
             sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
@@ -3823,7 +3836,7 @@ class BertModel(TextModel):
             tokenizer = SentencePieceProcessor()
             tokenizer.LoadFromFile(str(tokenizer_path))
 
-            vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+            vocab_size = max(self.hparams.get("vocab_size", 0), tokenizer.vocab_size())
 
         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
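The two hunks directly above replace a plain .get() default with max(), so a missing or under-reported "vocab_size" hparam can no longer shrink the token table below what the tokenizer actually provides. An illustration with made-up numbers:

    hparams = {"vocab_size": 0}    # hypothetical config carrying a bogus value
    tokenizer_vocab_size = 250002  # hypothetical tokenizer size

    old_style = hparams.get("vocab_size", tokenizer_vocab_size)          # 0, which would truncate the vocab
    new_style = max(hparams.get("vocab_size", 0), tokenizer_vocab_size)  # 250002
    print(old_style, new_style)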
@@ -3853,33 +3866,26 @@ class BertModel(TextModel):
             unk_token = tokenizer_config_json.get("unk_token")
             unk_token_id = added_vocab.get(unk_token, tokenizer_json["model"].get("unk_id", 3))
 
-            for token_id in range(vocab_size):
+            for token_id in range(tokenizer.vocab_size):
                 piece = tokenizer._convert_id_to_token(token_id)
-                text = piece.encode("utf-8")
-                score = tokenizer_json["model"]["vocab"][token_id][1]
-
-                toktype = SentencePieceTokenTypes.NORMAL
-                if token_id == unk_token_id:
-                    toktype = SentencePieceTokenTypes.UNKNOWN
-                elif token_id in tokenizer.all_special_ids:
-                    toktype = SentencePieceTokenTypes.CONTROL
-                elif token_id in added_vocab.values():
-                    toktype = SentencePieceTokenTypes.USER_DEFINED
-                # No reliable way to detect this, but jina doesn't have any
-                # elif tokenizer.IsByte(token_id):
-                #     toktype = SentencePieceTokenTypes.BYTE
-
-                tokens[token_id] = text
-                scores[token_id] = score
-                toktypes[token_id] = toktype
-
-            if vocab_size > len(tokens):
-                pad_count = vocab_size - len(tokens)
-                logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
-                for i in range(1, pad_count + 1):
-                    tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
-                    scores.append(-1000.0)
-                    toktypes.append(SentencePieceTokenTypes.UNUSED)
+                if (piece := tokenizer._convert_id_to_token(token_id)) is not None:
+                    text = piece.encode("utf-8")
+                    score = tokenizer_json["model"]["vocab"][token_id][1]
+
+                    toktype = SentencePieceTokenTypes.NORMAL
+                    if token_id == unk_token_id:
+                        toktype = SentencePieceTokenTypes.UNKNOWN
+                    elif token_id in tokenizer.all_special_ids:
+                        toktype = SentencePieceTokenTypes.CONTROL
+                    elif token_id in added_vocab.values():
+                        toktype = SentencePieceTokenTypes.USER_DEFINED
+                    # No reliable way to detect this, but jina doesn't have any
+                    # elif tokenizer.IsByte(token_id):
+                    #     toktype = SentencePieceTokenTypes.BYTE
+
+                    tokens[token_id] = text
+                    scores[token_id] = score
+                    toktypes[token_id] = toktype
 
         if isinstance(tokenizer, SentencePieceProcessor):
             # realign tokens (see HF tokenizer code)
@@ -3892,6 +3898,12 @@ class BertModel(TextModel):
                 SentencePieceTokenTypes.UNKNOWN,
             ] + toktypes[3:-1]
 
+        if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
+            # Add mask token missing from sentencepiece.bpe.model
+            tokens[250001] = b'<mask>'
+            scores[250001] = 0.0
+            toktypes[250001] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("t5")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
@@ -4057,6 +4069,34 @@ class NomicBertModel(BertModel):
             raise ValueError(f"unknown tokenizer: {toktyp}")
 
 
+@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
+class NeoBert(BertModel):
+    model_arch = gguf.MODEL_ARCH.NEO_BERT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NeoBERT uses 2/3 of the intermediate size as feed forward length
+        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
+        self.gguf_writer.add_rope_freq_base(10000.0)  # default value for NeoBERT
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        f_rms_eps = self.hparams.get("norm_eps", 1e-6)  # default value for NeoBERT
+        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+
+        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS)  # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
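The model classes added in this diff (ArceeModel above, NeoBert here, Dots1Model further down) all plug into the converter through @ModelBase.register(...), which maps a checkpoint's "architectures" string to the class that converts it; NeoBert registers three such names at once. A toy sketch of that registration pattern, for illustration only (not the converter's actual implementation):

    _converters: dict[str, type] = {}

    def register(*arch_names: str):
        # Decorator: record which class handles each HF "architectures" entry.
        def wrap(cls: type) -> type:
            for name in arch_names:
                _converters[name] = cls
            return cls
        return wrap

    @register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
    class NeoBertConverter:
        pass

    print(_converters["NeoBERTLMHead"].__name__)  # NeoBertConverter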
@@ -4796,25 +4836,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
@@ -4830,14 +4851,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
 
 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):
@@ -5078,9 +5091,7 @@ class DeepseekModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
             rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5284,6 +5295,34 @@ class DeepseekV2Model(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM
@@ -5942,7 +5981,8 @@ class ExaoneModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -6054,7 +6094,8 @@ class BailingMoeModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
         self.gguf_writer.add_rope_dimension_count(rope_dim)
         rope_scaling = self.hparams.get("rope_scaling") or {}
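The hunk above (and the next one) swaps a .get(...) or ... fallback for an explicit is-None check via the walrus operator. The two are not equivalent in general: "or" also falls back on falsy values such as 0, while the is-None form only falls back when the key is absent or null. A small illustration with a made-up value:

    hparams = {"head_dim": 0}  # hypothetical config with an explicit but falsy value

    via_or = hparams.get("head_dim") or 128            # 128: the falsy 0 triggers the fallback
    if (via_walrus := hparams.get("head_dim")) is None:
        via_walrus = 128                               # not taken: the key is present
    print(via_or, via_walrus)  # 128 0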
@@ -6086,7 +6127,8 @@ class BailingMoeModel(TextModel):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head
 
         output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
 
@@ -6347,8 +6389,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
         nargs="?",
     )
     parser.add_argument(
@@ -6451,18 +6493,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
         from huggingface_hub import snapshot_download
         local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
         dir_model = Path(local_dir)
         logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
         sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6482,9 +6526,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
         # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
     else:
         fname_out = dir_model
 
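With --remote, the default output name in the hunk above is derived from the repository ID; the "{ftype}" placeholder is filled in later from the chosen output type. A small sketch with a hypothetical repo ID:

    repo_id = "example-org/example-model-7b"  # hypothetical Hugging Face repo ID
    fname_out = "./" + repo_id.replace("/", "-") + "-{ftype}.gguf"
    print(fname_out)  # ./example-org-example-model-7b-{ftype}.gguf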
@@ -6513,7 +6557,7 @@ def main() -> None:
                                     split_max_tensors=args.split_max_tensors,
                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
                                     small_first_shard=args.no_tensor_first_split,
-                                    remote_hf_model_id=str(args.model) if args.remote else None)
+                                    remote_hf_model_id=hf_repo_id)
 
     if args.vocab_only:
         logger.info("Exporting model vocab...")
package/cpp/llama.cpp/ggml/CMakeLists.txt

@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
 message(DEBUG "INS_ENB : ${INS_ENB}")
 
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
 option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
@@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
-if (WIN32)
+if (MINGW)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
 endif()
 
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -367,6 +368,8 @@ if (MSVC)
         /wd4005 # Macro redefinition
         /wd4244 # Conversion from one type to another type, possible loss of data
         /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
+        /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
         /wd4996 # Disable POSIX deprecation warnings
         /wd4702 # Unreachable code warnings
     )
@@ -386,4 +389,46 @@ if (MSVC)
     disable_msvc_warnings(ggml-cpu-skylakex)
     disable_msvc_warnings(ggml-cpu-icelake)
     disable_msvc_warnings(ggml-cpu-alderlake)
+
+    if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
package/cpp/llama.cpp/ggml/cmake/common.cmake

@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
         (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
          CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
package/cpp/llama.cpp/ggml/include/ggml.h

@@ -2095,9 +2095,6 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
     GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
 
-    GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
     // print info and performance information for the graph
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
@@ -2181,6 +2178,7 @@ extern "C" {
 
     // scheduling priorities
     enum ggml_sched_priority {
+        GGML_SCHED_PRIO_LOW = -1,
         GGML_SCHED_PRIO_NORMAL,
         GGML_SCHED_PRIO_MEDIUM,
         GGML_SCHED_PRIO_HIGH,
package/cpp/llama.cpp/ggml/src/CMakeLists.txt

@@ -125,7 +125,6 @@ if (NOT MSVC)
 endif()
 
 if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
@@ -196,6 +195,7 @@ add_library(ggml-base
             ../include/ggml-opt.h
             ../include/gguf.h
             ggml.c
+            ggml.cpp
             ggml-alloc.c
             ggml-backend.cpp
             ggml-opt.cpp
@@ -212,6 +212,7 @@ endif()
 
 add_library(ggml
             ggml-backend-reg.cpp)
+add_library(ggml::ggml ALIAS ggml)
 
 target_link_libraries(ggml PUBLIC ggml-base)
 
@@ -226,6 +227,7 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
+        install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
@@ -268,17 +270,23 @@ endfunction()
 function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
-    foreach (feat NATIVE
-                  SSE42
-                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
-                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-                  AMX_TILE AMX_INT8 AMX_BF16)
-        set(GGML_${feat} OFF)
-    endforeach()
-
-    foreach (feat ${ARGN})
-        set(GGML_${feat} ON)
-    endforeach()
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        foreach (feat NATIVE
+                      SSE42
+                      AVX AVX2 BMI2 AVX_VNNI FMA F16C
+                      AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                      AMX_TILE AMX_INT8 AMX_BF16)
+            set(GGML_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
 endfunction()
@@ -288,6 +296,8 @@ ggml_add_backend(CPU)
 if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
@@ -301,8 +311,34 @@ if (GGML_CPU_ALL_VARIANTS)
             # MSVC doesn't support AMX
             ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            # Many of these features are optional so we build versions with popular
+            # combinations and name the backends based on the version they were
+            # first released with
+            ggml_add_cpu_backend_variant(armv8.0_1)
+            ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+            ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+            ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+            ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+            ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+        elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+            # Android-specific backends with SoC-compatible feature sets
+            ggml_add_cpu_backend_variant(android_armv8.0_1)
+            ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        elseif (APPLE)
+            ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
+            ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
+            ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
+        else()
+            message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp

@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32