@novastera-oss/llamarn 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/android/build.gradle +2 -1
  2. package/android/proguard-rules.pro +12 -0
  3. package/android/src/main/cpp/include/llama.h +15 -47
  4. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  8. package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
  9. package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
  12. package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
  13. package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
  14. package/android/src/main/jniLibs/x86/libggml.so +0 -0
  15. package/android/src/main/jniLibs/x86/libllama.so +0 -0
  16. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  17. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  18. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  19. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  20. package/cpp/build-info.cpp +2 -2
  21. package/cpp/llama.cpp/CMakePresets.json +11 -0
  22. package/cpp/llama.cpp/CODEOWNERS +1 -0
  23. package/cpp/llama.cpp/README.md +4 -3
  24. package/cpp/llama.cpp/common/arg.cpp +45 -1
  25. package/cpp/llama.cpp/common/common.cpp +22 -6
  26. package/cpp/llama.cpp/common/common.h +18 -4
  27. package/cpp/llama.cpp/convert_hf_to_gguf.py +500 -32
  28. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +12 -13
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +6 -1
  30. package/cpp/llama.cpp/ggml/cmake/ggml-config.cmake.in +85 -47
  31. package/cpp/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  32. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-alloc.c +0 -15
  34. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +7 -0
  35. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +8 -20
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +3 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +58 -3
  38. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +130 -22
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +122 -16
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +5 -2
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +1 -1
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +109 -12
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +3 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +88 -10
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +3 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +0 -1
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +3 -3
  50. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +14 -4
  51. package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cu +64 -17
  52. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +225 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +41 -301
  54. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +85 -67
  55. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +45 -62
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +28 -43
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +41 -56
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +36 -47
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +31 -43
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +22 -37
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +3 -13
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +73 -23
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +1 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +111 -3
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +6 -4
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1152 -689
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +92 -5
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cuh +2 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +275 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cuh +7 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +7 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +13 -1
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +2 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-impl.h +16 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +13 -3
  77. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +407 -69
  78. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +380 -83
  79. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +18 -4
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +2 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +295 -2
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d.cl +185 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/conv2d_f16_f32.cl +176 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f16.cl +1 -1
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/im2col_f32.cl +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +79 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +4 -4
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +14 -26
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +131 -46
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +8 -9
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +43 -43
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +2 -6
  94. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +287 -22
  95. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +265 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +1 -5
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +1 -1
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +1 -1
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +1 -1
  101. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
  102. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.comp +2 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +2 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +3 -8
  105. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +8 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rte.comp +5 -0
  108. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +71 -16
  109. package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +54 -0
  110. package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +907 -0
  111. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +60 -0
  112. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +35 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl +40 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl +56 -0
  115. package/cpp/llama.cpp/ggml/src/ggml.c +4 -6
  116. package/cpp/llama.cpp/gguf-py/gguf/constants.py +98 -0
  117. package/cpp/llama.cpp/gguf-py/gguf/metadata.py +4 -0
  118. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_dump.py +24 -1
  119. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +75 -52
  120. package/cpp/llama.cpp/include/llama.h +15 -7
  121. package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +34 -0
  122. package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +43 -0
  123. package/cpp/llama.cpp/requirements/requirements-all.txt +1 -0
  124. package/cpp/llama.cpp/requirements/requirements-server-bench.txt +5 -0
  125. package/cpp/llama.cpp/src/llama-arch.cpp +106 -0
  126. package/cpp/llama.cpp/src/llama-arch.h +5 -0
  127. package/cpp/llama.cpp/src/llama-batch.cpp +76 -70
  128. package/cpp/llama.cpp/src/llama-batch.h +24 -18
  129. package/cpp/llama.cpp/src/llama-chat.cpp +43 -1
  130. package/cpp/llama.cpp/src/llama-chat.h +2 -0
  131. package/cpp/llama.cpp/src/llama-context.cpp +180 -106
  132. package/cpp/llama.cpp/src/llama-context.h +26 -16
  133. package/cpp/llama.cpp/src/llama-cparams.h +3 -2
  134. package/cpp/llama.cpp/src/llama-graph.cpp +203 -39
  135. package/cpp/llama.cpp/src/llama-graph.h +147 -72
  136. package/cpp/llama.cpp/src/llama-hparams.cpp +40 -0
  137. package/cpp/llama.cpp/src/llama-hparams.h +10 -2
  138. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  139. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  140. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  141. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  142. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  143. package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +16 -1
  144. package/cpp/llama.cpp/src/llama-model.cpp +1293 -312
  145. package/cpp/llama.cpp/src/llama-model.h +3 -4
  146. package/cpp/llama.cpp/src/llama-quant.cpp +1 -2
  147. package/cpp/llama.cpp/src/llama-vocab.cpp +363 -8
  148. package/cpp/llama.cpp/src/llama-vocab.h +2 -0
  149. package/cpp/llama.cpp/src/unicode.cpp +207 -0
  150. package/cpp/llama.cpp/src/unicode.h +2 -0
  151. package/ios/include/common.h +18 -4
  152. package/ios/include/llama.h +15 -7
  153. package/ios/libs/llama.xcframework/Info.plist +15 -15
  154. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  155. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  156. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +15 -7
  157. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  158. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  159. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  160. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  161. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  162. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  165. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4016 -3891
  166. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +15 -7
  167. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +15 -7
  168. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  169. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +15 -7
  170. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  171. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  172. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  173. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5267 -5059
  174. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +15 -7
  175. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  176. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  177. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5238 -5030
  178. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4014 -3889
  179. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  180. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  181. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  182. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5303 -5095
  183. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +15 -7
  184. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  185. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  186. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +5274 -5066
  187. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4044 -3919
  188. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +15 -7
  189. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  190. package/package.json +4 -4
@@ -7,7 +7,6 @@ import pathlib
7
7
  import re
8
8
 
9
9
  import requests
10
- import sys
11
10
  import json
12
11
  import shutil
13
12
  import argparse
@@ -69,8 +68,7 @@ args = parser.parse_args()
69
68
  hf_token = args.hf_token if args.hf_token is not None else hf_token
70
69
 
71
70
  if hf_token is None:
72
- logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
73
- sys.exit(1)
71
+ logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
74
72
 
75
73
  # TODO: this string has to exercise as much pre-tokenizer functionality as possible
76
74
  # will be updated with time - contributions welcome
@@ -131,6 +129,7 @@ models = [
131
129
  {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
132
130
  {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
133
131
  {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
132
+ {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
134
133
  ]
135
134
 
136
135
  # some models are known to be broken upstream, so we will skip them as exceptions
@@ -146,11 +145,12 @@ pre_computed_hashes = [
146
145
  {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
147
146
  {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
148
147
  {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
148
+ {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
149
149
  ]
150
150
 
151
151
 
152
152
  def download_file_with_auth(url, token, save_path):
153
- headers = {"Authorization": f"Bearer {token}"}
153
+ headers = {"Authorization": f"Bearer {token}"} if token else None
154
154
  response = sess.get(url, headers=headers)
155
155
  response.raise_for_status()
156
156
  os.makedirs(os.path.dirname(save_path), exist_ok=True)
@@ -231,7 +231,7 @@ for model in models:
231
231
  # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
232
232
 
233
233
  src_ifs = ""
234
- for model in [*all_models, *pre_computed_hashes]:
234
+ for model in [*pre_computed_hashes, *all_models]:
235
235
  name = model["name"]
236
236
  tokt = model["tokt"]
237
237
  chkhsh = model.get("chkhsh")
@@ -239,11 +239,6 @@ for model in [*all_models, *pre_computed_hashes]:
239
239
  if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
240
240
  continue
241
241
 
242
- # Skip if the tokenizer folder does not exist or there are other download issues previously
243
- if not os.path.exists(f"models/tokenizers/{name}"):
244
- logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
245
- continue
246
-
247
242
  # create the tokenizer
248
243
  if chkhsh is not None:
249
244
  # if the model has a pre-computed hash, use it
@@ -253,15 +248,19 @@ for model in [*all_models, *pre_computed_hashes]:
253
248
  chkhsh = existing_models[name]
254
249
  else:
255
250
  # otherwise, compute the hash of the tokenizer
251
+
252
+ # Fail if the tokenizer folder with config does not exist or there are other download issues previously
253
+ if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
254
+ raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
255
+
256
256
  try:
257
257
  logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
258
258
  if name == "t5":
259
259
  tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
260
260
  else:
261
261
  tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
262
- except OSError as e:
263
- logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
264
- continue # Skip to the next model if the tokenizer can't be loaded
262
+ except Exception as e:
263
+ raise OSError(f"Error loading tokenizer for model {name}.") from e
265
264
 
266
265
  chktok = tokenizer.encode(CHK_TXT)
267
266
  chkhsh = sha256(str(chktok).encode()).hexdigest()
@@ -131,7 +131,7 @@ option(GGML_RVV "ggml: enable rvv" ON)
131
131
  option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
132
132
  option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
133
133
  option(GGML_VXE "ggml: enable vxe" ON)
134
- option(GGML_NNPA "ggml: enable nnpa" ON)
134
+ option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
135
135
 
136
136
  option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
137
137
  set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
174
174
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
175
175
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
176
176
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
177
+ option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
178
+ option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
177
179
  option(GGML_VULKAN "ggml: use Vulkan" OFF)
178
180
  option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
179
181
  option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
@@ -181,6 +183,8 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou
181
183
  option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
182
184
  option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
183
185
  option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
186
+ option(GGML_WEBGPU "ggml: use WebGPU" OFF)
187
+ option(GGML_WEBGPU_DEBUG "ggml: enable WebGPU debug output" OFF)
184
188
  option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
185
189
  option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
186
190
  option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
@@ -270,6 +274,7 @@ set(GGML_PUBLIC_HEADERS
270
274
  include/ggml-rpc.h
271
275
  include/ggml-sycl.h
272
276
  include/ggml-vulkan.h
277
+ include/ggml-webgpu.h
273
278
  include/gguf.h)
274
279
 
275
280
  set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
@@ -1,94 +1,130 @@
1
-
2
- @GGML_VARIABLES_EXPANDED@
3
-
4
1
  @PACKAGE_INIT@
5
2
 
6
- set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
7
- set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
8
- #set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
9
-
10
- find_package(Threads REQUIRED)
11
-
12
- find_library(GGML_LIBRARY ggml
13
- REQUIRED
14
- HINTS ${GGML_LIB_DIR}
15
- NO_CMAKE_FIND_ROOT_PATH)
16
-
17
- add_library(ggml::ggml UNKNOWN IMPORTED)
18
- set_target_properties(ggml::ggml
19
- PROPERTIES
20
- IMPORTED_LOCATION "${GGML_LIBRARY}")
21
-
22
- find_library(GGML_BASE_LIBRARY ggml-base
23
- REQUIRED
24
- HINTS ${GGML_LIB_DIR}
25
- NO_CMAKE_FIND_ROOT_PATH)
26
-
27
- add_library(ggml::ggml-base UNKNOWN IMPORTED)
28
- set_target_properties(ggml::ggml-base
29
- PROPERTIES
30
- IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
3
+ @GGML_VARIABLES_EXPANDED@
31
4
 
5
+ # Find all dependencies before creating any target.
6
+ include(CMakeFindDependencyMacro)
7
+ find_dependency(Threads)
32
8
  if (NOT GGML_SHARED_LIB)
9
+ set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
10
+ set(GGML_CPU_INTERFACE_LINK_OPTIONS "")
11
+
33
12
  if (APPLE AND GGML_ACCELERATE)
34
- find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
13
+ find_library(ACCELERATE_FRAMEWORK Accelerate)
14
+ if(NOT ACCELERATE_FRAMEWORK)
15
+ set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
16
+ return()
17
+ endif()
35
18
  list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
36
19
  endif()
37
20
 
38
- if (GGML_OPENMP)
39
- find_package(OpenMP REQUIRED)
21
+ if (GGML_OPENMP_ENABLED)
22
+ find_dependency(OpenMP)
40
23
  list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
41
24
  endif()
42
25
 
43
26
  if (GGML_CPU_HBM)
44
- find_library(memkind memkind REQUIRED)
27
+ find_library(memkind memkind)
28
+ if(NOT memkind)
29
+ set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
30
+ return()
31
+ endif()
45
32
  list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
46
33
  endif()
47
34
 
48
35
  if (GGML_BLAS)
49
- find_package(BLAS REQUIRED)
36
+ find_dependency(BLAS)
50
37
  list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
51
38
  list(APPEND GGML_CPU_INTERFACE_LINK_OPTIONS ${BLAS_LINKER_FLAGS})
52
39
  endif()
53
40
 
54
41
  if (GGML_CUDA)
55
- find_package(CUDAToolkit REQUIRED)
42
+ set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
43
+ find_dependency(CUDAToolkit)
44
+ if (GGML_STATIC)
45
+ list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
46
+ if (WIN32)
47
+ list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
48
+ else()
49
+ list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
50
+ endif()
51
+ endif()
52
+ if (NOT GGML_CUDA_NO_VMM)
53
+ list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
54
+ endif()
56
55
  endif()
57
56
 
58
57
  if (GGML_METAL)
59
- find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
60
- find_library(METAL_FRAMEWORK Metal REQUIRED)
61
- find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
58
+ find_library(FOUNDATION_LIBRARY Foundation)
59
+ find_library(METAL_FRAMEWORK Metal)
60
+ find_library(METALKIT_FRAMEWORK MetalKit)
61
+ if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
62
+ set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
63
+ return()
64
+ endif()
65
+ set(GGML_METAL_INTERFACE_LINK_LIBRARIES
66
+ ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
67
+ endif()
62
68
 
63
- list(APPEND GGML_METAL_INTERFACE_LINK_LIBRARIES
64
- ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
69
+ if (GGML_OPENCL)
70
+ find_dependency(OpenCL)
71
+ set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
65
72
  endif()
66
73
 
67
74
  if (GGML_VULKAN)
68
- find_package(Vulkan REQUIRED)
69
- list(APPEND GGML_VULKAN_INTERFACE_LINK_LIBRARIES Vulkan::Vulkan)
75
+ find_dependency(Vulkan)
76
+ set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
70
77
  endif()
71
78
 
72
79
  if (GGML_HIP)
73
- find_package(hip REQUIRED)
74
- find_package(hipblas REQUIRED)
75
- find_package(rocblas REQUIRED)
76
- list(APPEND GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
80
+ find_dependency(hip)
81
+ find_dependency(hipblas)
82
+ find_dependency(rocblas)
83
+ set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
77
84
  endif()
78
85
 
79
86
  if (GGML_SYCL)
87
+ set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
80
88
  find_package(DNNL)
81
89
  if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
82
90
  list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
83
91
  endif()
84
92
  if (WIN32)
85
- find_package(IntelSYCL REQUIRED)
86
- find_package(MKL REQUIRED)
93
+ find_dependency(IntelSYCL)
94
+ find_dependency(MKL)
87
95
  list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
88
96
  endif()
89
97
  endif()
90
98
  endif()
91
99
 
100
+ set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
101
+ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
102
+ #set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
103
+
104
+ if(NOT TARGET ggml::ggml)
105
+
106
+ find_package(Threads REQUIRED)
107
+
108
+ find_library(GGML_LIBRARY ggml
109
+ REQUIRED
110
+ HINTS ${GGML_LIB_DIR}
111
+ NO_CMAKE_FIND_ROOT_PATH)
112
+
113
+ add_library(ggml::ggml UNKNOWN IMPORTED)
114
+ set_target_properties(ggml::ggml
115
+ PROPERTIES
116
+ IMPORTED_LOCATION "${GGML_LIBRARY}")
117
+
118
+ find_library(GGML_BASE_LIBRARY ggml-base
119
+ REQUIRED
120
+ HINTS ${GGML_LIB_DIR}
121
+ NO_CMAKE_FIND_ROOT_PATH)
122
+
123
+ add_library(ggml::ggml-base UNKNOWN IMPORTED)
124
+ set_target_properties(ggml::ggml-base
125
+ PROPERTIES
126
+ IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
127
+
92
128
  set(_ggml_all_targets "")
93
129
  foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
94
130
  string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
@@ -149,4 +185,6 @@ set_target_properties(ggml::all
149
185
  PROPERTIES
150
186
  INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")
151
187
 
188
+ endif() # TARGET ggml::ggml
189
+
152
190
  check_required_components(ggml)
@@ -0,0 +1,19 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-backend.h"
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ #define GGML_WEBGPU_NAME "WebGPU"
11
+
12
+ // Needed for examples in ggml
13
+ GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void);
14
+
15
+ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void);
16
+
17
+ #ifdef __cplusplus
18
+ }
19
+ #endif
@@ -370,6 +370,7 @@ ggml_add_backend(MUSA)
370
370
  ggml_add_backend(RPC)
371
371
  ggml_add_backend(SYCL)
372
372
  ggml_add_backend(Vulkan)
373
+ ggml_add_backend(WebGPU)
373
374
  ggml_add_backend(OpenCL)
374
375
 
375
376
  foreach (target ggml-base ggml)
@@ -22,21 +22,6 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
22
22
  return t->view_src != NULL;
23
23
  }
24
24
 
25
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
26
- if (a->type != b->type) {
27
- return false;
28
- }
29
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
30
- if (a->ne[i] != b->ne[i]) {
31
- return false;
32
- }
33
- if (a->nb[i] != b->nb[i]) {
34
- return false;
35
- }
36
- }
37
- return true;
38
- }
39
-
40
25
  // ops that return true for this function must not use restrict pointers for their backend implementations
41
26
  static bool ggml_op_can_inplace(enum ggml_op op) {
42
27
  switch (op) {
@@ -45,6 +45,10 @@
45
45
  #include "ggml-vulkan.h"
46
46
  #endif
47
47
 
48
+ #ifdef GGML_USE_WEBGPU
49
+ #include "ggml-webgpu.h"
50
+ #endif
51
+
48
52
  #ifdef GGML_USE_OPENCL
49
53
  #include "ggml-opencl.h"
50
54
  #endif
@@ -173,6 +177,9 @@ struct ggml_backend_registry {
173
177
  #ifdef GGML_USE_VULKAN
174
178
  register_backend(ggml_backend_vk_reg());
175
179
  #endif
180
+ #ifdef GGML_USE_WEBGPU
181
+ register_backend(ggml_backend_webgpu_reg());
182
+ #endif
176
183
  #ifdef GGML_USE_OPENCL
177
184
  register_backend(ggml_backend_opencl_reg());
178
185
  #endif
@@ -352,21 +352,6 @@ ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
352
352
 
353
353
  // backend copy
354
354
 
355
- static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
356
- if (a->type != b->type) {
357
- return false;
358
- }
359
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
360
- if (a->ne[i] != b->ne[i]) {
361
- return false;
362
- }
363
- if (a->nb[i] != b->nb[i]) {
364
- return false;
365
- }
366
- }
367
- return true;
368
- }
369
-
370
355
  void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
371
356
  GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
372
357
 
@@ -662,6 +647,7 @@ struct ggml_backend_sched {
662
647
  // pipeline parallelism support
663
648
  int n_copies;
664
649
  int cur_copy;
650
+ int next_copy;
665
651
  ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
666
652
  struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
667
653
  int n_graph_inputs;
@@ -1448,8 +1434,6 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
1448
1434
  }
1449
1435
  }
1450
1436
 
1451
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
1452
-
1453
1437
  return GGML_STATUS_SUCCESS;
1454
1438
  }
1455
1439
 
@@ -1550,10 +1534,10 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
1550
1534
  bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
1551
1535
  GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
1552
1536
 
1553
- ggml_backend_sched_split_graph(sched, measure_graph);
1554
-
1555
1537
  ggml_backend_sched_synchronize(sched);
1556
1538
 
1539
+ ggml_backend_sched_split_graph(sched, measure_graph);
1540
+
1557
1541
  if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
1558
1542
  return false;
1559
1543
  }
@@ -1565,6 +1549,10 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
1565
1549
 
1566
1550
  bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
1567
1551
  GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
1552
+ GGML_ASSERT(!sched->is_alloc);
1553
+
1554
+ sched->cur_copy = sched->next_copy;
1555
+ sched->next_copy = (sched->next_copy + 1) % sched->n_copies;
1568
1556
 
1569
1557
  ggml_backend_sched_split_graph(sched, graph);
1570
1558
 
@@ -1605,7 +1593,7 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
1605
1593
  // if the graph is not already allocated, always use copy 0 after a synchronization
1606
1594
  // this ensures that during generation the same copy is used every time,
1607
1595
  // which avoids changes in the graph that could cause CUDA or other graphs to be disabled
1608
- sched->cur_copy = 0;
1596
+ sched->next_copy = 0;
1609
1597
  }
1610
1598
  }
1611
1599
 
@@ -77,6 +77,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
77
77
  for (int i = 0; i < final_dims; i++) {
78
78
  acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
79
79
  }
80
+ size_t elem_offset = offset / ggml_element_size(tensor);
81
+ acl_storage_len += elem_offset;
80
82
 
81
83
  // Reverse ne and stride.
82
84
  std::reverse(acl_ne, acl_ne + final_dims);
@@ -84,7 +86,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
84
86
 
85
87
  aclTensor* acl_tensor = aclCreateTensor(
86
88
  acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
87
- offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
89
+ elem_offset, format, &acl_storage_len, 1,
88
90
  tensor->data);
89
91
 
90
92
  return acl_tensor;
@@ -99,7 +99,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclT
99
99
  }
100
100
  }
101
101
 
102
- void ggml_cann_unary_op(
102
+ void ggml_cann_op_unary(
103
103
  std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
104
104
  ggml_backend_cann_context& ctx, ggml_tensor* dst) {
105
105
  ggml_tensor* src = dst->src[0];
@@ -111,6 +111,42 @@ void ggml_cann_unary_op(
111
111
  ggml_cann_release_resources(ctx, acl_src, acl_dst);
112
112
  }
113
113
 
114
+ void ggml_cann_op_unary_gated(
115
+ std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
116
+ ggml_backend_cann_context& ctx, ggml_tensor* dst) {
117
+ ggml_tensor* src0 = dst->src[0];
118
+ ggml_tensor* src1 = dst->src[1];
119
+
120
+ GGML_ASSERT(ggml_is_contiguous_1(src0));
121
+ GGML_ASSERT(ggml_is_contiguous_1(dst));
122
+ const int32_t swapped = ggml_get_op_params_i32(dst, 1);
123
+
124
+ aclTensor* acl_dst = ggml_cann_create_tensor(dst);
125
+ aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
126
+ if(src1) {
127
+ GGML_ASSERT(ggml_is_contiguous_1(src1));
128
+ GGML_ASSERT(src0->type == src1->type);
129
+
130
+ acl_src0 = ggml_cann_create_tensor(src0);
131
+ acl_src1 = ggml_cann_create_tensor(src1);
132
+ } else {
133
+ int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
134
+ size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
135
+ acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
136
+ acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
137
+ if (swapped) {
138
+ std::swap(acl_src0, acl_src1);
139
+ }
140
+ }
141
+
142
+ unary_op(ctx, acl_src0, acl_dst);
143
+ GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
144
+
145
+ ggml_cann_release_resources(ctx, acl_src0, acl_dst);
146
+ if(src1)
147
+ ggml_cann_release_resources(ctx, acl_src1);
148
+ }
149
+
114
150
  /**
115
151
  * @brief Repeats elements of a tensor along each dimension according to the
116
152
  * specified repeat array.
@@ -1785,8 +1821,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
1785
1821
  size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
1786
1822
  bcast_weight_nb[2], bcast_weight_nb[3],
1787
1823
  bcast_weight_nb[4], bcast_weight_nb[5]};
1788
- aclTensor* acl_weight_tensor =
1789
- ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
1824
+ aclTensor* acl_weight_tensor;
1825
+
1826
+ bool weightToNZ = false;
1827
+ #ifdef ASCEND_310P
1828
+ weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
1829
+ #endif
1830
+ if (weightToNZ && is_matmul_weight(weight)) {
1831
+ int64_t acl_stride[2] = {1, transpose_ne[1]};
1832
+
1833
+ // Reverse ne.
1834
+ std::reverse(transpose_ne, transpose_ne + n_dims);
1835
+
1836
+ std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
1837
+
1838
+ acl_weight_tensor = aclCreateTensor(
1839
+ transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
1840
+ 0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
1841
+ } else {
1842
+ acl_weight_tensor =
1843
+ ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
1844
+ }
1790
1845
  aclTensor* acl_dst =
1791
1846
  ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
1792
1847