@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (192)
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
package/cpp/llama.cpp/convert_hf_to_gguf.py
@@ -519,7 +519,7 @@ class TextModel(ModelBase):
     def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(self.block_count)
 
-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")
 
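Note: this hunk extends TextModel's context-length detection to configs that only expose "max_length". The sketch below re-implements the first-match-wins lookup to show the effect; this find_hparam is a simplified stand-in for the converter's method, not the real implementation.

    # Simplified stand-in for ModelBase.find_hparam: first matching key wins.
    def find_hparam(hparams, keys, optional=False):
        for key in keys:
            if key in hparams:
                return hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

    keys = ["max_position_embeddings", "n_ctx", "n_positions", "max_length"]
    assert find_hparam({"n_ctx": 1024}, keys, optional=True) == 1024
    # configs that only carry "max_length" (the newly added key) now resolve too:
    assert find_hparam({"max_length": 512}, keys, optional=True) == 512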
@@ -1898,9 +1898,7 @@ class LlamaModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
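Note: this release rewrites the recurring three-line head_dim fallback as an assignment expression; later hunks (DeciModel, InternLM3Model, DeepseekModel, ExaoneModel, BailingMoeModel) apply the same pattern. A standalone check that both forms agree:

    hparams = {"hidden_size": 4096, "num_attention_heads": 32}  # no "head_dim" key

    # old form: membership test plus a second lookup
    if "head_dim" in hparams:
        rope_dim_old = hparams["head_dim"]
    else:
        rope_dim_old = hparams["hidden_size"] // hparams["num_attention_heads"]

    # new form: one .get() plus a walrus assignment
    if (rope_dim := hparams.get("head_dim")) is None:
        rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

    assert rope_dim_old == rope_dim == 128

Unlike the `hparams.get("head_dim") or ...` spelling replaced in the BailingMoe hunks below, the `is None` test keeps a present-but-falsy value (an explicit 0) instead of silently recomputing it.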
@@ -1982,7 +1980,8 @@ class LlamaModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -2017,6 +2016,20 @@ class LlamaModel(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("ArceeForCausalLM")
+class ArceeModel(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.ARCEE
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self._try_set_pooling_type()
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
+
+
 @ModelBase.register(
     "LlavaForConditionalGeneration", # pixtral
     "Mistral3ForConditionalGeneration", # mistral small 3.1
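Note: the new ArceeModel only emits YARN rope-scaling keys when the config matches the guard above. The fragment below runs the same guard against an illustrative, hypothetical rope_scaling block:

    rope_scaling = {
        "rope_type": "yarn",          # older configs may carry "type" instead
        "factor": 4.0,
        "original_max_position_embeddings": 8192,
    }

    if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
        print("rope scaling     : YARN")
        print("factor           :", rope_scaling["factor"])
        print("original ctx len :", rope_scaling["original_max_position_embeddings"])

Inside the guard, "original_max_position_embeddings" is read unconditionally, so a yarn config missing that key would raise a KeyError.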
@@ -2304,9 +2317,7 @@ class DeciModel(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
@@ -2346,7 +2357,8 @@ class DeciModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3664,9 +3676,7 @@ class InternLM3Model(TextModel):
         hparams = self.hparams
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
 
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)
 
@@ -3709,8 +3719,7 @@ class BertModel(TextModel):
         self._try_set_pooling_type()
 
         if self.cls_out_labels:
-            key_name = gguf.Keys.Classifier.OUTPUT_LABELS.format(arch = gguf.MODEL_ARCH_NAMES[self.model_arch])
-            self.gguf_writer.add_array(key_name, [v for k, v in sorted(self.cls_out_labels.items())])
+            self.gguf_writer.add_classifier_output_labels([v for k, v in sorted(self.cls_out_labels.items())])
 
     def set_vocab(self):
         tokens, toktypes, tokpre = self.get_vocab_base()
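Note: the replaced two-line add_array call and the new add_classifier_output_labels helper write the same payload: label strings ordered by class index. The comprehension in isolation, on a toy id2label-style mapping (assumed shape):

    cls_out_labels = {1: "POSITIVE", 0: "NEGATIVE"}
    labels = [v for k, v in sorted(cls_out_labels.items())]
    assert labels == ["NEGATIVE", "POSITIVE"]  # sorted by integer key, not insertion order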
@@ -4060,6 +4069,34 @@ class NomicBertModel(BertModel):
             raise ValueError(f"unknown tokenizer: {toktyp}")
 
 
+@ModelBase.register("NeoBERT", "NeoBERTLMHead", "NeoBERTForSequenceClassification")
+class NeoBert(BertModel):
+    model_arch = gguf.MODEL_ARCH.NEO_BERT
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        # NeoBERT uses 2/3 of the intermediate size as feed forward length
+        self.gguf_writer.add_feed_forward_length(int(2 * self.hparams["intermediate_size"] / 3))
+        self.gguf_writer.add_rope_freq_base(10000.0)  # default value for NeoBERT
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+        f_rms_eps = self.hparams.get("norm_eps", 1e-6)  # default value for NeoBERT
+        self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
+        logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
+
+        self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS)  # https://huggingface.co/chandar-lab/NeoBERT#how-to-use
+
+    def modify_tensors(self, data_torch, name, bid):
+        if name.startswith("decoder."):
+            return []
+
+        if name.startswith("model."):
+            name = name[6:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
 class XLMRobertaModel(BertModel):
     model_arch = gguf.MODEL_ARCH.BERT
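Note: NeoBert.modify_tensors above does two things before deferring to the BertModel mapping: drop LM-head tensors and strip the "model." prefix. The renaming rule in isolation (neobert_rename is an illustrative helper, not part of the converter):

    def neobert_rename(name: str):
        if name.startswith("decoder."):
            return None           # LM-head tensor, skipped entirely
        if name.startswith("model."):
            name = name[6:]       # len("model.") == 6
        return name

    assert neobert_rename("decoder.weight") is None
    assert neobert_rename("model.encoder.0.qkv.weight") == "encoder.0.qkv.weight"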
@@ -4799,25 +4836,6 @@ class OlmoeModel(TextModel):
 class JinaBertV2Model(BertModel):
     model_arch = gguf.MODEL_ARCH.JINA_BERT_V2
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.intermediate_size = self.hparams["intermediate_size"]
-
-    def get_tensors(self):
-        for name, data in super().get_tensors():
-            if 'gated_layer' in name:
-                d1 = data[:self.intermediate_size, :]
-                name1 = name.replace('gated_layers', 'gated_layers_w')
-                name1 = name1.replace('up_gated_layer', 'gated_layers_v')
-                d2 = data[self.intermediate_size:, :]
-                name2 = name.replace('gated_layers', 'gated_layers_v')
-                name2 = name2.replace('up_gated_layer', 'gated_layers_w')
-                yield name1, d1
-                yield name2, d2
-                continue
-
-            yield name, data
-
     def set_vocab(self):
         tokenizer_class = 'BertTokenizer'
         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f:
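Note: the deleted get_tensors override split Jina's fused gated_layers weight (gate and up projections stacked along dim 0) into two half-size tensors; this diff removes the special case from JinaBertV2Model but does not show where the equivalent handling now lives. The slicing itself, on a toy nested list standing in for a tensor, with intermediate_size = 2:

    intermediate_size = 2
    data = [[1, 1], [2, 2], [3, 3], [4, 4]]   # rows 0-1: gate half, rows 2-3: up half
    d1 = data[:intermediate_size]             # was emitted as ...gated_layers_w
    d2 = data[intermediate_size:]             # was emitted as ...gated_layers_v
    assert d1 == [[1, 1], [2, 2]] and d2 == [[3, 3], [4, 4]]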
@@ -4833,14 +4851,6 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_bos_token(True)
         self.gguf_writer.add_add_eos_token(True)
 
-    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # if name starts with "bert.", remove the prefix
-        # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
-        if name.startswith("bert."):
-            name = name[5:]
-
-        return super().modify_tensors(data_torch, name, bid)
-
 
 @ModelBase.register("OpenELMForCausalLM")
 class OpenELMModel(TextModel):
@@ -5081,9 +5091,7 @@ class DeepseekModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        if "head_dim" in hparams:
-            rope_dim = hparams["head_dim"]
-        else:
+        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
        self.gguf_writer.add_rope_dimension_count(rope_dim)
@@ -5287,6 +5295,34 @@ class DeepseekV2Model(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Dots1ForCausalLM")
+class Dots1Model(Qwen2MoeModel):
+    model_arch = gguf.MODEL_ARCH.DOTS1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.hparams["num_experts"] = self.hparams["n_routed_experts"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"])
+        self.gguf_writer.add_expert_shared_count(self.hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"])
+        self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
+
+        if self.hparams["scoring_func"] == "noaux_tc":
+            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
+        else:
+            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+        if "shared_experts" in name:
+            return [(self.map_tensor_name(name), data_torch)]
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @ModelBase.register("PLMForCausalLM")
 class PLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.PLM
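Note: Dots1Model.modify_tensors performs one literal rename so the tensor name matches the gguf tensor map, and routes shared-expert tensors straight through map_tensor_name. The string surgery in isolation (illustrative tensor name):

    name = "model.layers.3.mlp.gate.e_score_correction_bias"
    renamed = name.replace("e_score_correction_bias", "e_score_correction.bias")
    assert renamed == "model.layers.3.mlp.gate.e_score_correction.bias"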
@@ -5945,7 +5981,8 @@ class ExaoneModel(TextModel):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -6057,7 +6094,8 @@ class BailingMoeModel(TextModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
-        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+        if (rope_dim := hparams.get("head_dim")) is None:
+            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
 
        self.gguf_writer.add_rope_dimension_count(rope_dim)
        rope_scaling = self.hparams.get("rope_scaling") or {}
@@ -6089,7 +6127,8 @@ class BailingMoeModel(TextModel):
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
         n_embd = self.hparams["hidden_size"]
-        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = n_embd // n_head
 
        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
 
@@ -6350,8 +6389,8 @@ def parse_args() -> argparse.Namespace:
         help="model is executed on big endian machine",
     )
     parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file",
+        "model", type=str,
+        help="directory containing model file or huggingface repository ID (if --remote)",
        nargs="?",
    )
    parser.add_argument(
@@ -6454,18 +6493,20 @@ def main() -> None:
     else:
         logging.basicConfig(level=logging.INFO)
 
-    dir_model = args.model
-
     if args.remote:
+        hf_repo_id = args.model
        from huggingface_hub import snapshot_download
        local_dir = snapshot_download(
-            repo_id=str(dir_model),
+            repo_id=hf_repo_id,
            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
        dir_model = Path(local_dir)
        logger.info(f"Downloaded config and tokenizer to {local_dir}")
+    else:
+        hf_repo_id = None
+        dir_model = Path(args.model)
 
     if not dir_model.is_dir():
-        logger.error(f'Error: {args.model} is not a directory')
+        logger.error(f'Error: {dir_model} is not a directory')
        sys.exit(1)
 
     ftype_map: dict[str, gguf.LlamaFileType] = {
@@ -6485,9 +6526,9 @@ def main() -> None:
 
     if args.outfile is not None:
         fname_out = args.outfile
-    elif args.remote:
+    elif hf_repo_id:
        # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
+        fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf")
    else:
        fname_out = dir_model
 
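Note: with --remote, the output name is now derived from the retained hf_repo_id instead of re-parsing args.model. For a hypothetical repository ID the derivation yields:

    hf_repo_id = "example-org/tiny-model"   # hypothetical repo ID, passed as the model argument
    fname_out = "./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf"
    assert fname_out == "./example-org-tiny-model-{ftype}.gguf"
    # "{ftype}" is a placeholder the converter fills in later with the chosen output type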
@@ -6516,7 +6557,7 @@ def main() -> None:
         split_max_tensors=args.split_max_tensors,
         split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
         small_first_shard=args.no_tensor_first_split,
-        remote_hf_model_id=str(args.model) if args.remote else None)
+        remote_hf_model_id=hf_repo_id)
 
     if args.vocab_only:
         logger.info("Exporting model vocab...")
package/cpp/llama.cpp/ggml/CMakeLists.txt
@@ -105,7 +105,7 @@ message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
 message(DEBUG "INS_ENB : ${INS_ENB}")
 
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
-option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
+option(GGML_CPU_REPACK "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
 option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
@@ -137,7 +137,7 @@ set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
-if (WIN32)
+if (MINGW)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
 endif()
 
@@ -172,6 +172,7 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
+option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -367,6 +368,8 @@ if (MSVC)
         /wd4005 # Macro redefinition
         /wd4244 # Conversion from one type to another type, possible loss of data
         /wd4267 # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4305 # Conversion from 'type1' to 'type2', possible loss of data
+        /wd4566 # Conversion from 'char' to 'wchar_t', possible loss of data
         /wd4996 # Disable POSIX deprecation warnings
         /wd4702 # Unreachable code warnings
     )
@@ -386,4 +389,46 @@ if (MSVC)
     disable_msvc_warnings(ggml-cpu-skylakex)
     disable_msvc_warnings(ggml-cpu-icelake)
     disable_msvc_warnings(ggml-cpu-alderlake)
+
+    if (GGML_BUILD_EXAMPLES)
+        disable_msvc_warnings(common-ggml)
+        disable_msvc_warnings(common)
+
+        disable_msvc_warnings(mnist-common)
+        disable_msvc_warnings(mnist-eval)
+        disable_msvc_warnings(mnist-train)
+
+        disable_msvc_warnings(gpt-2-ctx)
+        disable_msvc_warnings(gpt-2-alloc)
+        disable_msvc_warnings(gpt-2-backend)
+        disable_msvc_warnings(gpt-2-sched)
+        disable_msvc_warnings(gpt-2-quantize)
+        disable_msvc_warnings(gpt-2-batched)
+
+        disable_msvc_warnings(gpt-j)
+        disable_msvc_warnings(gpt-j-quantize)
+
+        disable_msvc_warnings(magika)
+        disable_msvc_warnings(yolov3-tiny)
+        disable_msvc_warnings(sam)
+
+        disable_msvc_warnings(simple-ctx)
+        disable_msvc_warnings(simple-backend)
+    endif()
+
+    if (GGML_BUILD_TESTS)
+        disable_msvc_warnings(test-mul-mat)
+        disable_msvc_warnings(test-arange)
+        disable_msvc_warnings(test-backend-ops)
+        disable_msvc_warnings(test-cont)
+        disable_msvc_warnings(test-conv-transpose)
+        disable_msvc_warnings(test-conv-transpose-1d)
+        disable_msvc_warnings(test-conv1d)
+        disable_msvc_warnings(test-conv2d)
+        disable_msvc_warnings(test-conv2d-dw)
+        disable_msvc_warnings(test-customop)
+        disable_msvc_warnings(test-dup)
+        disable_msvc_warnings(test-opt)
+        disable_msvc_warnings(test-pool)
+    endif ()
 endif()
package/cpp/llama.cpp/ggml/cmake/common.cmake
@@ -36,8 +36,7 @@ function(ggml_get_system_arch)
             (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
              CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
         set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
-    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR
-            "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc|power")
         set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
     elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         set(GGML_SYSTEM_ARCH "loongarch64" PARENT_SCOPE)
package/cpp/llama.cpp/ggml/src/CMakeLists.txt
@@ -125,7 +125,6 @@ if (NOT MSVC)
 endif()
 
 if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
@@ -213,6 +212,7 @@ endif()
 
 add_library(ggml
             ggml-backend-reg.cpp)
+add_library(ggml::ggml ALIAS ggml)
 
 target_link_libraries(ggml PUBLIC ggml-base)
 
@@ -270,17 +270,23 @@ endfunction()
 function(ggml_add_cpu_backend_variant tag_name)
     set(GGML_CPU_TAG_NAME ${tag_name})
     # other: OPENMP LLAMAFILE CPU_HBM
-    foreach (feat NATIVE
-                  SSE42
-                  AVX AVX2 BMI2 AVX_VNNI FMA F16C
-                  AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
-                  AMX_TILE AMX_INT8 AMX_BF16)
-        set(GGML_${feat} OFF)
-    endforeach()
-
-    foreach (feat ${ARGN})
-        set(GGML_${feat} ON)
-    endforeach()
+    if (GGML_SYSTEM_ARCH STREQUAL "x86")
+        foreach (feat NATIVE
+                      SSE42
+                      AVX AVX2 BMI2 AVX_VNNI FMA F16C
+                      AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
+                      AMX_TILE AMX_INT8 AMX_BF16)
+            set(GGML_${feat} OFF)
+        endforeach()
+
+        foreach (feat ${ARGN})
+            set(GGML_${feat} ON)
+        endforeach()
+    elseif (GGML_SYSTEM_ARCH STREQUAL "ARM")
+        foreach (feat ${ARGN})
+            set(GGML_INTERNAL_${feat} ON)
+        endforeach()
+    endif()
 
     ggml_add_cpu_backend_variant_impl(${tag_name})
 endfunction()
@@ -290,6 +296,8 @@ ggml_add_backend(CPU)
 if (GGML_CPU_ALL_VARIANTS)
     if (NOT GGML_BACKEND_DL)
         message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
+    elseif (GGML_CPU_ARM_ARCH)
+        message(FATAL_ERROR "Cannot use both GGML_CPU_ARM_ARCH and GGML_CPU_ALL_VARIANTS")
     endif()
     if (GGML_SYSTEM_ARCH STREQUAL "x86")
         ggml_add_cpu_backend_variant(x64)
@@ -303,8 +311,34 @@ if (GGML_CPU_ALL_VARIANTS)
             # MSVC doesn't support AMX
             ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
         endif()
+    elseif(GGML_SYSTEM_ARCH STREQUAL "ARM")
+        if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+            # Many of these features are optional so we build versions with popular
+            # combinations and name the backends based on the version they were
+            # first released with
+            ggml_add_cpu_backend_variant(armv8.0_1)
+            ggml_add_cpu_backend_variant(armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(armv8.2_3 DOTPROD FP16_VECTOR_ARITHMETIC SVE)
+            ggml_add_cpu_backend_variant(armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8)
+            ggml_add_cpu_backend_variant(armv8.6_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2)
+            ggml_add_cpu_backend_variant(armv9.2_1 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SME)
+            ggml_add_cpu_backend_variant(armv9.2_2 DOTPROD FP16_VECTOR_ARITHMETIC SVE MATMUL_INT8 SVE2 SME)
+        elseif (CMAKE_SYSTEM_NAME MATCHES "Android")
+            # Android-specific backends with SoC-compatible feature sets
+            ggml_add_cpu_backend_variant(android_armv8.0_1)
+            ggml_add_cpu_backend_variant(android_armv8.2_1 DOTPROD)
+            ggml_add_cpu_backend_variant(android_armv8.2_2 DOTPROD FP16_VECTOR_ARITHMETIC)
+            ggml_add_cpu_backend_variant(android_armv8.6_1 DOTPROD FP16_VECTOR_ARITHMETIC MATMUL_INT8)
+        elseif (APPLE)
+            ggml_add_cpu_backend_variant(apple_m1 DOTPROD)
+            ggml_add_cpu_backend_variant(apple_m2_m3 DOTPROD MATMUL_INT8)
+            ggml_add_cpu_backend_variant(apple_m4 DOTPROD MATMUL_INT8 NOSVE SME)
+        else()
+            message(FATAL_ERROR "Unsupported ARM target OS: ${CMAKE_SYSTEM_NAME}")
+        endif()
     else()
-        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported on ${GGML_SYSTEM_ARCH}")
+        message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS not yet supported with ${GGML_SYSTEM_ARCH} on ${CMAKE_SYSTEM_NAME}")
     endif()
 elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp
@@ -69,6 +69,9 @@
 #if defined(__clang__)
 #    pragma clang diagnostic push
 #    pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic push
+#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
 #endif
 
 namespace fs = std::filesystem;
@@ -91,6 +94,8 @@ static std::string path_str(const fs::path & path) {
 
 #if defined(__clang__)
 #    pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#    pragma GCC diagnostic pop
 #endif
 
 #ifdef _WIN32
package/cpp/llama.cpp/ggml/src/ggml-cann/common.h
@@ -37,6 +37,7 @@
 #include <thread>
 #include <unistd.h>
 #include <functional>
+#include <optional>
 
 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
@@ -103,6 +104,9 @@ const ggml_cann_device_info& ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();
 
+std::optional<std::string> get_env(const std::string& name);
+bool parse_bool(const std::string& value);
+
 /**
  * @brief Abstract base class for memory pools used by CANN.
  */
@@ -354,7 +358,8 @@ struct ggml_backend_cann_context {
         : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) {
         ggml_cann_set_device(device);
         description = aclrtGetSocName();
-        async_mode = (getenv("GGML_CANN_ASYNC_MODE") != nullptr);
+
+        bool async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
     }
package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp
@@ -31,6 +31,8 @@
 #include <mutex>
 #include <queue>
 #include <chrono>
+#include <unordered_set>
+#include <optional>
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -93,6 +95,26 @@ int32_t ggml_cann_get_device() {
     return id;
 }
 
+/**
+ * @brief Get the value of the specified environment variable (name).
+ * if not empty, return a std::string object
+ */
+std::optional<std::string> get_env(const std::string& name) {
+    const char* val = std::getenv(name.c_str());
+    if (!val) return std::nullopt;
+    std::string res = std::string(val);
+    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
+    return res;
+}
+
+/**
+ * @brief Verify whether the environment variable is a valid value.
+ */
+bool parse_bool(const std::string& value) {
+    std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
+    return valid_values.find(value) != valid_values.end();
+}
+
 /**
  * @brief Initialize the CANN device information.
  *
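Note: the new CANN helpers lower-case the variable's value (in get_env) and then match it against a whitelist (in parse_bool). Their combined behavior, mirrored in Python for quick reference (a behavioral sketch of the C++ above, not a binding):

    import os

    def cann_env_flag(name: str) -> bool:
        value = os.environ.get(name, "").lower()                  # get_env: fetch + lower-case
        return value in {"on", "1", "yes", "y", "enable", "true"} # parse_bool whitelist

    os.environ["GGML_CANN_ASYNC_MODE"] = "YES"
    assert cann_env_flag("GGML_CANN_ASYNC_MODE")
    os.environ["GGML_CANN_ASYNC_MODE"] = "off"   # unrecognized spellings count as false
    assert not cann_env_flag("GGML_CANN_ASYNC_MODE")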
@@ -214,7 +236,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf_prio(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
 
     /**
@@ -410,7 +432,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
      * @param device The device ID to associate with this buffer pool.
      */
     explicit ggml_cann_pool_buf(int device) : device(device) {
-        disable_clean = getenv("GGML_CANN_DISABLE_BUF_POOL_CLEAN") != nullptr;
+        disable_clean = parse_bool(get_env("GGML_CANN_DISABLE_BUF_POOL_CLEAN").value_or(""));
     }
 
     /**
@@ -731,16 +753,18 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
  */
 std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
     int device) {
-    bool disable_vmm = (getenv("GGML_CANN_DISABLE_VMM_POOL") != nullptr);
-    if (!disable_vmm && ggml_cann_info().devices[device].vmm) {
-        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
-        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
-    }
-    bool enable_buf_prio = (getenv("GGML_CANN_ENABLE_BUF_PRIO_POOL") != nullptr);
-    if (enable_buf_prio) {
+    std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
+
+    if (mem_pool_type == "prio") {
         GGML_LOG_INFO("%s: device %d use buffer pool with priority queue\n", __func__, device);
         return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf_prio(device));
     }
+
+    if (ggml_cann_info().devices[device].vmm && mem_pool_type != "leg") {
+        GGML_LOG_INFO("%s: device %d use vmm pool\n", __func__, device);
+        return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
+    }
+
     GGML_LOG_INFO("%s: device %d use buffer pool\n", __func__, device);
     return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_buf(device));
 }
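Note: pool selection now keys off a single GGML_CANN_MEM_POOL variable ("prio", "leg", or unset) instead of the two GGML_CANN_DISABLE_VMM_POOL / GGML_CANN_ENABLE_BUF_PRIO_POOL flags. The decision order, restated as a small Python sketch (vmm_supported stands in for ggml_cann_info().devices[device].vmm):

    def choose_pool(mem_pool_type: str, vmm_supported: bool) -> str:
        if mem_pool_type == "prio":
            return "buffer pool with priority queue"
        if vmm_supported and mem_pool_type != "leg":
            return "vmm pool"
        return "buffer pool"   # plain buffer pool is the fallback

    assert choose_pool("", True) == "vmm pool"
    assert choose_pool("leg", True) == "buffer pool"
    assert choose_pool("prio", False) == "buffer pool with priority queue"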
package/cpp/llama.cpp/ggml/src/ggml-common.h
@@ -1074,6 +1074,10 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
     0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
 GGML_TABLE_END()
 
+GGML_TABLE_BEGIN(int8_t, kvalues_iq4nl, 16)
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113,
+GGML_TABLE_END()
+
 #define NGRID_IQ1S 2048
 #define IQ1S_DELTA 0.125f
 #define IQ1M_DELTA 0.125f
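Note: kvalues_iq4nl is the 16-entry signed codebook used by the IQ4_NL family of quantizations: each 4-bit index selects one of these values, which is then combined with a per-block scale. A rough nearest-codeword illustration in Python (real encoding and scaling details omitted):

    KVALUES_IQ4NL = [-127, -104, -83, -65, -49, -35, -22, -10,
                     1, 13, 25, 38, 53, 69, 89, 113]

    def nearest_index(x: float) -> int:
        # index of the codebook entry closest to x (x assumed pre-scaled to roughly [-127, 127])
        return min(range(len(KVALUES_IQ4NL)), key=lambda i: abs(KVALUES_IQ4NL[i] - x))

    assert KVALUES_IQ4NL[nearest_index(60.0)] == 53
    assert KVALUES_IQ4NL[nearest_index(-100.0)] == -104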