@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
  int g_ggml_sycl_debug = 0;
  int g_ggml_sycl_disable_optimize = 0;
  int g_ggml_sycl_disable_graph = 0;
+ int g_ggml_sycl_disable_dnn = 0;
  int g_ggml_sycl_prioritize_dmmv = 0;
 
  static ggml_sycl_device_info ggml_sycl_init() {
@@ -196,12 +197,22 @@ static void ggml_check_sycl() try {
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
  g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
  g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+ g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
  g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
  GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
  GGML_LOG_INFO("Running with Environment Variables:\n");
  GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+ #ifdef GGML_SYCL_GRAPH
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+ #else
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n");
+ #endif
+ #if GGML_SYCL_DNNL
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn);
+ #else
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n");
+ #endif
  GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
  GGML_LOG_INFO("Build with Macros:\n");
  #if defined(GGML_SYCL_FORCE_MMQ)
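
The hunks above add a new runtime toggle, GGML_SYCL_DISABLE_DNN, alongside the existing SYCL flags: it is read once at startup through get_sycl_env and defaults to 0, so the oneDNN GEMM path stays enabled unless the user opts out. A minimal sketch of the read-with-default pattern these flags share (get_env_int is a hypothetical stand-in for the helper used in this file):

    #include <cstdlib>

    // Hypothetical stand-in for get_sycl_env(name, default): returns the
    // integer value of the environment variable, or the default when unset.
    static int get_env_int(const char * name, int default_val) {
        const char * val = std::getenv(name);
        return val != nullptr ? std::atoi(val) : default_val;
    }

    // Mirrors the hunk above: oneDNN stays on unless the process is started
    // with GGML_SYCL_DISABLE_DNN=1 in its environment.
    static int g_disable_dnn = get_env_int("GGML_SYCL_DISABLE_DNN", 0);
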
@@ -335,13 +346,15 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
  static enum ggml_status
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
  ggml_tensor *tensor) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor, "\n");
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
 
  if (tensor->view_src != NULL) {
  assert(tensor->view_src->buffer->buft == buffer->buft);
  return GGML_STATUS_SUCCESS;
  }
- if (tensor->type == GGML_TYPE_Q4_0 && !g_ggml_sycl_disable_optimize) {
+ if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
  tensor->extra = extra;
  ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -370,20 +383,23 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
  ggml_tensor *tensor,
  const void *data, size_t offset,
  size_t size) try {
-
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
  ggml_sycl_set_device(ctx->device);
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
- SYCL_CHECK(
- CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
+ #ifndef _WIN32
  // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU.
  // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here.
- char* host_buf = (char*)malloc(size);
+ char * host_buf = (char *) malloc(size);
  memcpy(host_buf, data, size);
- SYCL_CHECK(
- CHECK_TRY_ERROR((*stream).memcpy((char *)tensor->data + offset, host_buf, size)
- .wait()));
+ SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait()));
  free(host_buf);
+ #else
+ SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait()));
+ #endif
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
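
On non-Windows builds the hunk above keeps the existing workaround for an mmap() issue on PVC GPUs: the source data is staged through a plain malloc'd host buffer before the device memcpy, while the new #else branch on Windows copies straight from the caller's pointer. A self-contained sketch of the staging pattern, with device_memcpy standing in for the SYCL queue copy:

    #include <cstdlib>
    #include <cstring>

    // Stage mmap-backed data through an ordinary heap buffer before the
    // device copy, as in the #ifndef _WIN32 branch above. device_memcpy is
    // a placeholder for the (*stream).memcpy(...).wait() call.
    static void staged_upload(void * dst_dev, const void * src, size_t size,
                              void (*device_memcpy)(void *, const void *, size_t)) {
        char * host_buf = (char *) malloc(size);
        memcpy(host_buf, src, size);            // faults the mmap pages in on the host
        device_memcpy(dst_dev, host_buf, size); // device copy now reads plain heap memory
        free(host_buf);
    }
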
@@ -395,7 +411,9 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  const ggml_tensor *tensor,
  void *data, size_t offset,
  size_t size) try {
-
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
  ggml_sycl_set_device(ctx->device);
@@ -423,7 +441,12 @@ static bool
  ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
  const ggml_tensor *src,
  ggml_tensor *dst) try {
- if (ggml_backend_buffer_is_sycl(src->buffer)) {
+ bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": dst=", dst);
+ debug_print_tensor(" src=", src);
+ GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
+ if (is_cpy_supported) {
  ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
  ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context;
 
@@ -480,7 +503,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
 
  static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
  uint8_t value) try {
- ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
+ GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size);
+ ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
 
  ggml_sycl_set_device(ctx->device);
  queue_ptr stream = ctx->stream;
@@ -499,7 +523,9 @@ catch (sycl::exception const &exc) {
 
  static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
  size_t offset, size_t size) {
- GGML_SYCL_DEBUG(" [SYCL] call %s\n", __func__);
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
  SYCL_CHECK(ggml_sycl_set_device(ctx->device));
  auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue());
@@ -777,6 +803,8 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff
  static enum ggml_status
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
  ggml_tensor *tensor) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor, "\n");
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
 
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -861,6 +889,9 @@ static void
  ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
  ggml_tensor *tensor, const void *data,
  size_t offset, size_t size) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  // split tensors must always be set in their entirety at once
  GGML_ASSERT(offset == 0);
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -914,6 +945,9 @@ static void
  ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
  const ggml_tensor *tensor, void *data,
  size_t offset, size_t size) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  // split tensors must always be set in their entirety at once
  GGML_ASSERT(offset == 0);
  GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -1985,31 +2019,30 @@ inline void ggml_sycl_op_mul_mat_sycl(
 
  const int64_t ne00 = src0->ne[0];
  const int64_t ne10 = src1->ne[0];
-
+ GGML_ASSERT(ne00 == ne10);
 
  const int64_t row_diff = row_high - row_low;
 
  int id;
  SYCL_CHECK(
  CHECK_TRY_ERROR(id = get_current_device_id()));
- #if !GGML_SYCL_DNNL
- const int64_t ne0 = dst->ne[0];
+
+ const int64_t ne0 = dst->ne[0]; // used by MKL only
  // the main device has a larger memory buffer to hold the results from all GPUs
  // ldc == nrows of the matrix that cuBLAS writes into
- int ldc = id == ctx.device ? ne0 : row_diff;
- #endif
+ int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only
 
  #ifdef GGML_SYCL_F16
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
  #else
  bool use_fp16 = false;
  #endif
- if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
- use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
- dst->op_params[0] == GGML_PREC_DEFAULT) {
- // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
+ if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) &&
+ row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
  ggml_sycl_pool_alloc<sycl::half> src0_as_f16(ctx.pool());
  if (src0->type != GGML_TYPE_F16) {
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
+ " : converting src0 to fp16");
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst);
  GGML_ASSERT(to_fp16_sycl != nullptr);
  size_t ne = row_diff*ne00;
@@ -2022,6 +2055,8 @@ inline void ggml_sycl_op_mul_mat_sycl(
 
  ggml_sycl_pool_alloc<sycl::half> src1_as_f16(ctx.pool());
  if (src1->type != GGML_TYPE_F16) {
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
+ " : converting src1 to fp16");
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
  GGML_ASSERT(to_fp16_sycl != nullptr);
  size_t ne = src1_ncols*ne10;
@@ -2033,37 +2068,47 @@ inline void ggml_sycl_op_mul_mat_sycl(
  : src1_as_f16.get();
  ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
 
- #if !GGML_SYCL_DNNL
- const sycl::half alpha_f16 = 1.0f;
- const sycl::half beta_f16 = 0.0f;
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
- *stream, oneapi::math::transpose::trans,
- oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
- &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
- src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
- dst_f16.get(), dpct::library_data_t::real_half, ldc,
- dpct::library_data_t::real_half)));
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
- to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
- #else
- DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
- DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
- dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
- to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+ #if GGML_SYCL_DNNL
+ if (!g_ggml_sycl_disable_dnn) {
+ DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
+ DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+ dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+ " : converting dst to fp32");
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
+ to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+ }
+ else
  #endif
- }
- else {
- // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
+ {
+ const sycl::half alpha_f16 = 1.0f;
+ const sycl::half beta_f16 = 0.0f;
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
+ *stream, oneapi::math::transpose::trans,
+ oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10,
+ &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+ src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
+ dst_f16.get(), dpct::library_data_t::real_half, ldc,
+ dpct::library_data_t::real_half)));
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+ " : converting dst to fp32");
+ const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
+ to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+ }
+ } else {
  ggml_sycl_pool_alloc<float> src0_ddq_as_f32(ctx.pool());
  ggml_sycl_pool_alloc<float> src1_ddq_as_f32(ctx.pool());
  if (src0->type != GGML_TYPE_F32) {
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+ " : converting src0 to fp32");
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst);
  GGML_ASSERT(to_fp32_sycl != nullptr);
  src0_ddq_as_f32.alloc(row_diff*ne00);
  to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
  }
  if (src1->type != GGML_TYPE_F32) {
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
+ " : converting src1 to fp32");
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst);
  GGML_ASSERT(to_fp32_sycl != nullptr);
  src1_ddq_as_f32.alloc(src1_ncols*ne10);
@@ -2072,18 +2117,22 @@ inline void ggml_sycl_op_mul_mat_sycl(
  const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
  const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
- #if !GGML_SYCL_DNNL
- const float alpha = 1.0f;
- const float beta = 0.0f;
- SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
- get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
- src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
- dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
- #else
- DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
- DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
- dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+ #if GGML_SYCL_DNNL
+ if (!g_ggml_sycl_disable_dnn) {
+ DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
+ DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
+ }
+ else
  #endif
+ {
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+ SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm(
+ get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff,
+ src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
+ dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+ }
  }
  GGML_UNUSED(dst);
  GGML_UNUSED(src1_ddq_i);
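
Both GEMM call sites above now share one gating idiom: the oneDNN branch exists only when GGML_SYCL_DNNL is compiled in, and at runtime g_ggml_sycl_disable_dnn selects between oneDNN and the oneMath/MKL fallback. Placing the `else` immediately before `#endif` lets the brace block that follows serve as the fallback in both build configurations. A compilable skeleton of the idiom (the path bodies are placeholders):

    #define GGML_SYCL_DNNL 1 // assumption: built with oneDNN support

    static int g_ggml_sycl_disable_dnn = 0; // runtime toggle, see GGML_SYCL_DISABLE_DNN

    static void gemm_dispatch() {
    #if GGML_SYCL_DNNL
        if (!g_ggml_sycl_disable_dnn) {
            // oneDNN GEMM path
        }
        else
    #endif
        {
            // oneMath/MKL GEMM path; when oneDNN is compiled out the `else`
            // disappears with the `if`, leaving this as a plain block.
        }
    }
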
@@ -2095,8 +2144,7 @@ catch (sycl::exception const &exc) {
  std::exit(1);
  }
 
- static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
+ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2148,8 +2196,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
  sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
  }
 
- inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
+ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2180,8 +2227,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor *
  argsort_f32_i32_sycl(src0_dd, (int *) dst_dd, ncols, nrows, order, main_stream);
  }
 
- inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
+ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_I32);
 
@@ -2196,8 +2242,7 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *ds
  argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
  }
 
- inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tensor *dst) {
-
+ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2214,8 +2259,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx,ggml_tens
  diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
  }
 
- inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-
+ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
  dpct::queue_ptr main_stream = ctx.stream();
@@ -2402,6 +2446,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
  dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
 
  if (src1_on_device && src1_is_contiguous) {
+ scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
+ /*num_src=*/2, " : converting src1 to Q8_1");
  quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
  /*
  DPCT1010:90: SYCL uses exceptions to report errors and does not
@@ -2506,6 +2552,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
  }
 
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
+ scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
+ /*num_src=*/2, " : converting src1 to Q8_1");
  quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
  /*
  DPCT1010:92: SYCL uses exceptions to report errors and does
@@ -2600,33 +2648,28 @@ catch (sycl::exception const &exc) {
 
 
  static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
  ggml_sycl_op_get_rows(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
  static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_norm(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
  static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_rms_norm(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
  static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_l2_norm(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
  static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
- GGML_SYCL_DEBUG("call %s\n", __func__);
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_group_norm(ctx, dst);
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }
 
  static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
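
The hunk above replaces paired "call %s" / "call %s done" debug logs with a single scope_op_debug_print object, so the exit message can no longer be forgotten on an early return. A sketch of the RAII shape such a helper presumably has (the real class also prints tensor details; this version only logs entry and exit):

    #include <cstdio>

    // RAII logger: prints on construction and again on destruction, covering
    // every exit path of the enclosing scope.
    struct scope_logger {
        const char * name;
        explicit scope_logger(const char * n) : name(n) {
            std::fprintf(stderr, "[SYCL] call %s\n", name);
        }
        ~scope_logger() {
            std::fprintf(stderr, "[SYCL] call %s done\n", name);
        }
    };

    static void ggml_sycl_norm_example() {
        scope_logger dbg(__func__); // logs entry now, exit when dbg is destroyed
        // ... op body ...
    }
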
@@ -2697,7 +2740,7 @@ catch (sycl::exception const &exc) {
  std::exit(1);
  }
 
- static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, char * dst,
+ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::half * src1_as_f16, void * dst,
  const void ** ptrs_src, void ** ptrs_dst, int64_t ne12, int64_t ne13, int64_t ne23,
  size_t nb02, size_t nb03, size_t nb12, size_t nb13, size_t nbd2, size_t nbd3,
  int64_t r2, int64_t r3, const sycl::nd_item<3> & item_ct1) {
@@ -2713,7 +2756,7 @@ static void k_compute_batched_ptrs(const sycl::half * src0_as_f16, const sycl::h
 
  const uint8_t * src0_bytes = reinterpret_cast<const uint8_t *>(src0_as_f16);
  const uint8_t * src1_bytes = reinterpret_cast<const uint8_t *>(src1_as_f16);
- uint8_t * dst_bytes = reinterpret_cast<uint8_t *>(dst);
+ uint8_t * dst_bytes = static_cast<uint8_t *>(dst);
 
  ptrs_src[0 * ne23 + i12 + i13 * ne12] = src0_bytes + i02 * nb02 + i03 * nb03;
  ptrs_src[1 * ne23 + i12 + i13 * ne12] = src1_bytes + i12 * nb12 + i13 * nb13;
@@ -2726,6 +2769,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
  GGML_ASSERT(!ggml_is_transposed(src1));
  GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
  GGML_TENSOR_BINARY_OP_LOCALS
 
@@ -2753,6 +2797,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
 
  // convert src1 to fp16
  if (src1->type != GGML_TYPE_F16) {
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_nc_sycl", dst, /*num_src=*/2,
+ " : converting src1 to fp16");
  const to_fp16_nc_sycl_t to_fp16_nc_sycl = get_to_fp16_nc_sycl(src1->type);
  GGML_ASSERT(to_fp16_nc_sycl != nullptr);
  const int64_t ne_src1 = ggml_nelements(src1);
@@ -2766,7 +2812,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
  }
 
  ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
- char * dst_t = reinterpret_cast<char *>(dst_ddf);
 
  dpct::library_data_t mkl_compute_type = dpct::library_data_t::real_float;
  dpct::library_data_t mkl_data_type = dpct::library_data_t::real_float;
@@ -2783,42 +2828,83 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
 
  GGML_ASSERT(ne12 % ne02 == 0);
  GGML_ASSERT(ne13 % ne03 == 0);
+ GGML_ASSERT(ne01 == static_cast<int64_t>(nb1/nb0));
+ GGML_ASSERT(ne10 == ne00);
 
  // broadcast factors
  const int64_t r2 = ne12 / ne02;
  const int64_t r3 = ne13 / ne03;
 
- if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
- // there is no broadcast and src0, src1 are contiguous across dims 2, 3
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
- oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
- src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
- src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_t,
- mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
- } else {
- const int ne23 = ne12 * ne13;
-
- ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
- ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
- ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
-
- sycl::range<3> block_dims(1, ne12, ne13);
- queue->submit([&](sycl::handler & cgh) {
- const void ** ptrs_src_get = ptrs_src.get();
- void ** ptrs_dst_get = ptrs_dst.get();
- size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
- size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
- cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
- k_compute_batched_ptrs(src0_f16, src1_f16, dst_t, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
- nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
+ #if GGML_SYCL_DNNL
+ if (!g_ggml_sycl_disable_dnn) {
+ auto dnn_gemm = [&ctx, queue, ne11, ne01, ne10, nb00, nb01, nb02, s11, s12]
+ (const sycl::half* src1, const sycl::half* src0, float* dst, const dnnl_dim_t batches_a, const dnnl_dim_t batches_b) {
+
+ DnnlGemmWrapper::gemm(ctx, ne11,ne01, ne10,
+ src1, DnnlGemmWrapper::to_dt<sycl::half>(), s11, 1, s12,
+ src0, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
+ dst, DnnlGemmWrapper::to_dt<float>(), queue, batches_a, batches_b);
+ };
+
+ if (r2 == 1 && r3 == 1) {
+ if (ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+ dnn_gemm(src1_f16, src0_f16, dst_ddf, ne12*ne13, ne02 * ne03);
+ }
+ else {
+ for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+ const sycl::half* src0_f16_shifted = src0_f16 + ((ie03*nb03)/sizeof(sycl::half)); // nb is in bytes
+ const sycl::half* src1_f16_shifted = src1_f16 + ie03*s13;
+ float* dst_shifted = dst_ddf + ((ie03*nb3)/sizeof(float));
+ dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, ne12, ne02);
+ }
+ }
+ } else {
+ // iterate over batches from smaller set of matrices (matrix 0)
+ for (int64_t ie02 = 0; ie02 < ne02; ++ie02) {
+ for (int64_t ie03 = 0; ie03 < ne03; ++ie03) {
+ const sycl::half* src0_f16_shifted = src0_f16 + ((ie02*nb02 + ie03*nb03)/sizeof(sycl::half));
+ const sycl::half* src1_f16_shifted = src1_f16 + ie02*s12*r2 + ie03*s13*r3;
+ float* dst_shifted = dst_ddf + ((ie02*nb2*r2 + ie03*nb3*r3)/sizeof(float));
+ dnn_gemm(src1_f16_shifted, src0_f16_shifted, dst_shifted, r2*r3, 1);
+ }
+ }
+ }
+ }
+ else
+ #endif
+ {
+ if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
+ // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
+ oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+ src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
+ src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
+ mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
+ } else {
+ const int ne23 = ne12 * ne13;
+
+ ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2 * ne23);
+ ggml_sycl_pool_alloc<void *> ptrs_dst(ctx.pool(), 1 * ne23);
+ ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
+
+ sycl::range<3> block_dims(1, ne12, ne13);
+ queue->submit([&](sycl::handler & cgh) {
+ const void ** ptrs_src_get = ptrs_src.get();
+ void ** ptrs_dst_get = ptrs_dst.get();
+ size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : s12 * sizeof(sycl::half);
+ size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : s13 * sizeof(sycl::half);
+ cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+ k_compute_batched_ptrs(src0_f16, src1_f16, dst_ddf, ptrs_src_get, ptrs_dst_get, ne12, ne13, ne23, nb02,
+ nb03, nb12_scaled, nb13_scaled, nbd2, nbd3, r2, r3, item_ct1);
+ });
  });
- });
 
- SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
- *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
- (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
- (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
- (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
+ SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+ *queue, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
+ (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
+ (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, s11, beta,
+ (void **) (ptrs_dst.get() + 0 * ne23), mkl_data_type, ne0, ne23, mkl_compute_type, matrix_info.get())));
+ }
  }
  } catch (const sycl::exception & exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
@@ -2841,6 +2927,8 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
  return true;
+ case GGML_TYPE_Q4_K:
+ return !g_ggml_sycl_prioritize_dmmv;
  default:
  return false;
  }
@@ -2858,6 +2946,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
  inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
  switch (type) {
  case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_K:
  return true;
  default:
  return false;
  }
2884
2973
  }
2885
2974
 
2886
- static void reorder_qw(char *data_device, const int ncols, const int nrows,
2887
- size_t size, size_t offset, dpct::queue_ptr stream) {
2888
- auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
2975
+ static void reorder_qw_q4_0(uint8_t * data_device, const int ncols, const int nrows, size_t size, size_t offset,
2976
+ dpct::queue_ptr stream) {
2977
+ auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
2889
2978
  SYCL_CHECK(
2890
2979
  CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size)
2891
2980
  .wait()));
2892
2981
  GGML_ASSERT((size % sizeof(block_q4_0) == 0));
2893
2982
  GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
2894
2983
  int offset_blks = offset / sizeof(block_q4_0);
2895
- auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
2984
+ auto qs_ptr = data_device + offset_blks * QK4_0 / 2;
2896
2985
  auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;
2897
2986
 
2898
2987
  stream->parallel_for(
@@ -2906,25 +2995,66 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
2906
2995
  *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j];
2907
2996
  }
2908
2997
  *(d_ptr + ib) = x[ib].d;
2909
- });
2998
+ }).wait_and_throw();
2999
+
3000
+ sycl::free(tmp_buf, *stream);
3001
+ }
3002
+
3003
+ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3004
+ GGML_ASSERT(size % sizeof(block_q4_K) == 0);
3005
+ GGML_ASSERT(offset % sizeof(block_q4_K) == 0);
3006
+
3007
+ const int nblocks = size / sizeof(block_q4_K);
3008
+
3009
+ auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
3010
+ SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
3011
+
3012
+ auto * qs_ptr = data_device;
3013
+ auto * scales_ptr = qs_ptr + QK_K / 2 * nblocks;
3014
+ auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * nblocks);
3015
+
3016
+ stream->parallel_for(nblocks, [=](auto i) {
3017
+ const block_q4_K * x = (const block_q4_K *) tmp_buf;
3018
+ const int ib = i;
3019
+
3020
+ for (int j = 0; j < QK_K / 2; ++j) {
3021
+ qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j];
3022
+ }
3023
+
3024
+ for (int j = 0; j < K_SCALE_SIZE; ++j) {
3025
+ scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j];
3026
+ }
3027
+
3028
+ dm_ptr[ib] = x[ib].dm;
3029
+ }).wait_and_throw();
2910
3030
 
2911
3031
  sycl::free(tmp_buf, *stream);
2912
3032
  }
2913
3033
 
2914
3034
  static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
2915
- char*data_device = (char*)src0->data;
3035
+ uint8_t * data_device = (uint8_t *) src0->data;
2916
3036
  size_t ncols = src0->ne[0];
2917
3037
  size_t nrows = src0->ne[1];
2918
3038
  size_t size = ggml_nbytes(src0);
2919
3039
 
2920
- reorder_qw(data_device, ncols, nrows, size, 0, stream);
3040
+ switch (src0->type) {
3041
+ case GGML_TYPE_Q4_0:
3042
+ reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream);
3043
+ break;
3044
+ case GGML_TYPE_Q4_K:
3045
+ reorder_qw_q4_k(data_device, size, 0, stream);
3046
+ break;
3047
+ default:
3048
+ GGML_ABORT("reorder_qw() called with unsupported type");
3049
+ break;
3050
+ }
2921
3051
  }
2922
3052
 
2923
3053
  static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
2924
3054
  return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
2925
3055
  ctx.opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf.
2926
3056
  dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases.
2927
- dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
3057
+ dst->src[1]->ne[1]==1 && dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
2928
3058
  }
2929
3059
 
2930
3060
  static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
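
The new reorder_qw_q4_k above converts the array-of-structs block_q4_K layout into three planar arrays: all qs bytes first (QK_K/2 per block), then all scales (K_SCALE_SIZE per block), then the half2 dm pairs. Byte-offset arithmetic for that layout, assuming the ggml block constants QK_K = 256 and K_SCALE_SIZE = 12, with dm stored as two 16-bit halves (4 bytes) per block:

    #include <cstddef>

    constexpr size_t QK_K         = 256; // weights per Q4_K super-block (ggml)
    constexpr size_t K_SCALE_SIZE = 12;  // packed scale/min bytes per block (ggml)

    // Reordered buffer: [ all qs | all scales | all dm ]
    size_t qs_offset(size_t ib)                     { return ib * (QK_K / 2); }
    size_t scales_offset(size_t nblocks, size_t ib) { return nblocks * (QK_K / 2) + ib * K_SCALE_SIZE; }
    size_t dm_offset(size_t nblocks, size_t ib) {
        return nblocks * (QK_K / 2) + nblocks * K_SCALE_SIZE + ib * 4; // half2 = 4 bytes
    }
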
@@ -2960,8 +3090,19 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
  extra->optimized_feature.reorder = true; // Used to decode/dequan in next steps and avoid re-reordering
  }

- static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

+ static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+ src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+ }
+
+ static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ return ggml_is_quantized(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
+ src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+ }
+
+ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
  const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer);
  int64_t min_compute_capability = INT_MAX;

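The scope_op_debug_print object introduced above, and threaded through the remaining op entry points below, is an RAII helper defined earlier in ggml-sycl.cpp, outside this diff. A minimal sketch of the pattern it follows, with a hypothetical body since the real implementation is not shown here:

```cpp
// Hypothetical sketch of the RAII pattern behind scope_op_debug_print; the real
// helper is defined elsewhere in ggml-sycl.cpp and is not part of this diff.
#include <cstdio>

struct scope_op_debug_print_sketch {
    const char * func;
    scope_op_debug_print_sketch(const char * f, const void * dst, int num_src) : func(f) {
        // Log on entry: which op, which dst tensor, how many sources it reads.
        std::fprintf(stderr, "[SYCL] call %s: dst=%p num_src=%d\n", func, dst, num_src);
    }
    ~scope_op_debug_print_sketch() {
        // Runs on every exit path, so early returns are logged too.
        std::fprintf(stderr, "[SYCL] call %s done\n", func);
    }
};
```

Because the destructor fires on every exit path, the explicit GGML_SYCL_DEBUG("call %s done\n", __func__) line can be removed from ggml_sycl_mul_mat() two hunks below without losing the exit trace.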
@@ -2984,13 +3125,9 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
  }

  // check data types and tensor shapes for custom matrix multiplication kernels:
- bool use_dequantize_mul_mat_vec = ggml_sycl_supports_dmmv(src0->type)
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
- && src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+ bool use_dequantize_mul_mat_vec = can_use_dequantize_mul_mat_vec(src0, src1, dst);

- bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
- && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
- && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+ bool use_mul_mat_vec_q = can_use_mul_mat_vec_q(src0, src1, dst);

  bool use_mul_mat_q = ggml_sycl_supports_mmq(src0->type)
  && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
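Factoring the shape and type checks into can_use_dequantize_mul_mat_vec() and can_use_mul_mat_vec_q() makes the boundary between kernel families explicit: DMMV is a single-row path, while MMVQ covers small batches up to MMVQ_MAX_BATCH_SIZE. A rough illustration of how src1's batch dimension selects a path; MMVQ_MAX_BATCH_SIZE = 8 is an assumption carried over from upstream ggml, and the real dispatch below also weighs device capability, fp16 support and split buffers:

```cpp
// Illustration only: how src1's batch dimension picks a kernel family for a
// quantized src0 with F32 activations. Constants and names here are assumptions.
#include <cstdint>

constexpr int64_t MMVQ_MAX_BATCH_SIZE_SKETCH = 8; // assumed from upstream ggml

enum class mm_path { dmmv, mmvq, mmq_or_sycl };

mm_path pick_mul_mat_path(int64_t src1_ne1, bool supports_dmmv, bool ne0_aligned) {
    if (supports_dmmv && ne0_aligned && src1_ne1 == 1) {
        return mm_path::dmmv;       // single row: dequantize + mat-vec
    }
    if (src1_ne1 <= MMVQ_MAX_BATCH_SIZE_SKETCH) {
        return mm_path::mmvq;       // small batch: quantized mat-vec
    }
    return mm_path::mmq_or_sycl;    // larger batches: MMQ or oneMKL GEMM
}
```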
@@ -3041,11 +3178,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
  } else {
  constexpr bool convert_src1_to_q8_1 = false;
- // MUL_MAT_SYCL supports reorder
- opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
  ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
  }
- GGML_SYCL_DEBUG("call %s done\n", __func__);
  }


@@ -3116,6 +3250,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous(

  static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
  ggml_tensor *dst) try {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
  const ggml_tensor *src0 = dst->src[0];
  const ggml_tensor *src1 = dst->src[1];
  GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers");
@@ -3284,37 +3419,45 @@ catch (sycl::exception const &exc) {
  }

  static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_scale(ctx, dst);
  }

  static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_diag_mask_inf(ctx, dst);
  }

  static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  ggml_sycl_op_pool2d(ctx, dst);
  }

  static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
  ggml_sycl_op_im2col(ctx, dst);
  }

  static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
  ggml_sycl_op_sum(ctx, dst);
  }

  static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
  ggml_sycl_op_sum_rows(ctx, dst);
  }

  static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
  ggml_sycl_op_argsort(ctx, dst);
  }

  static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
  GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
  ggml_sycl_op_argmax(ctx, dst);
  }
@@ -3400,6 +3543,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
  case GGML_UNARY_OP_GELU_QUICK:
  ggml_sycl_gelu_quick(ctx, dst);
  break;
+ case GGML_UNARY_OP_GELU_ERF:
+ ggml_sycl_gelu_erf(ctx, dst);
+ break;
  case GGML_UNARY_OP_TANH:
  ggml_sycl_tanh(ctx, dst);
  break;
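GGML_UNARY_OP_GELU_ERF, wired up above, is the exact erf-based GELU rather than the tanh (GELU) or sigmoid (GELU_QUICK) approximations. As a scalar reference for what the new ggml_sycl_gelu_erf() kernel computes; the kernel itself lives in another file of this package:

```cpp
// Scalar reference for erf-based GELU: 0.5 * x * (1 + erf(x / sqrt(2))).
// Sketch only; the actual SYCL kernel is defined elsewhere in this package.
#include <cmath>

static float gelu_erf_ref(float x) {
    const float inv_sqrt2 = 0.70710678f; // 1 / sqrt(2)
    return 0.5f * x * (1.0f + std::erf(x * inv_sqrt2));
}
```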
@@ -3608,6 +3754,9 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
  ggml_tensor *tensor,
  const void *data, size_t offset,
  size_t size) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

@@ -3626,13 +3775,16 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
  const ggml_tensor *tensor,
  void *data, size_t offset,
  size_t size) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": tensor=", tensor);
+ GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

  GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
  const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
  SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
- data, (const char *)tensor->data + offset, size).wait()));
+ data, (const char *)tensor->data + offset, size)));
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
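Dropping the trailing .wait() from this memcpy (and from the copy in the next hunk) is what makes the entry point genuinely asynchronous: the copy is only enqueued, and completion is established later by ggml_backend_sycl_synchronize(), which waits on the same in-order queue. A minimal sketch of that contract, with illustrative names rather than the package's API:

```cpp
// Sketch of the async contract assumed above: the copy is only enqueued, and a
// later queue-wide wait() establishes completion. Names are illustrative.
#include <cstddef>
#include <sycl/sycl.hpp>

void get_tensor_async_sketch(sycl::queue & q, void * dst, const void * src, size_t n) {
    q.memcpy(dst, src, n); // returns immediately; no host-side wait
}

void synchronize_sketch(sycl::queue & q) {
    q.wait(); // all previously submitted work, including the memcpy, finishes here
}
```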
@@ -3644,7 +3796,13 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
  const ggml_tensor *src,
  ggml_tensor *dst) try {
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
- if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
+ bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
+ ggml_backend_buffer_is_sycl(src->buffer);
+ GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
+ debug_print_tensor(": dst=", dst);
+ debug_print_tensor(" src=", src);
+ GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
+ if (is_cpy_supported) {
  /*
  DPCT1009:215: SYCL uses exceptions to report errors and does not use the
  error codes. The original code was commented out and a warning string
@@ -3652,7 +3810,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
  */
  const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
  SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy(
- dst->data, src->data, ggml_nbytes(dst)).wait()));
+ dst->data, src->data, ggml_nbytes(dst))));
  return true;
  }

@@ -3665,6 +3823,7 @@ catch (sycl::exception const &exc) {
  }

  static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
+ GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
  const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0);
  SYCL_CHECK(CHECK_TRY_ERROR((stream)->wait()));
@@ -3701,11 +3860,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
  }
  }

+ #ifdef GGML_SYCL_GRAPH
+ static bool check_graph_compatibility(ggml_cgraph * cgraph) {
+ if (ggml_sycl_info().device_count > 1) {
+ // A sycl_ex::command_graph object can only be created for a single device
+ GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__);
+ return false;
+ }
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ const ggml_op node_op = cgraph->nodes[i]->op;
+ switch (node_op) {
+ default:
+ break;
+ case GGML_OP_CONCAT:
+ // ggml_sycl_op_concat() does a blocking host wait after memcpy operations,
+ // but wait() can't be called on the events returned by a queue recording
+ // to a graph.
+ [[fallthrough]];
+ case GGML_OP_MUL_MAT_ID:
+ // ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after
+ // submitting a memcpy operation, but wait() can't be called on a queue that
+ // is recording to a graph.
+ GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__,
+ ggml_op_name(node_op));
+ return false;
+ }
+ }
+ return true;
+ }
+ #endif
+
  static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);

  #ifdef GGML_SYCL_GRAPH
- if (!g_ggml_sycl_disable_graph) {
+ bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph);
+ if (use_sycl_graph) {
  const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph);
  if (!graph_support) {
  GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
@@ -3713,7 +3904,8 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
  return GGML_STATUS_SUCCESS;
  }

- sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+ sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()), {sycl_ex::property::graph::assume_buffer_outlives_graph{}});
+
  model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
  ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
  model_sycl_graph.end_recording();
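The assume_buffer_outlives_graph property added to the command_graph constructor tells the runtime it need not guard against buffers being freed while the graph is alive, which holds here because ggml owns the tensor allocations for the graph's lifetime. For context, the full record/finalize/replay cycle that surrounds the recording shown above, sketched against the oneAPI SYCL-Graph extension (the finalize and replay steps happen outside this hunk):

```cpp
// Record-and-replay sketch for a SYCL-Graph, mirroring the flow above. Assumes a
// device with the ext_oneapi_limited_graph aspect; error handling elided.
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/graph.hpp>

namespace sycl_ex = sycl::ext::oneapi::experimental;

void run_graph_sketch(sycl::queue & q) {
    sycl_ex::command_graph graph(q, {sycl_ex::property::graph::assume_buffer_outlives_graph{}});
    graph.begin_recording(q);
    // ... submit the model's kernels to q; they are recorded, not executed ...
    graph.end_recording();
    auto exec = graph.finalize();      // bake the recorded graph into an executable one
    q.ext_oneapi_graph(exec).wait();   // replay the whole graph with one submission
}
```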
@@ -3765,7 +3957,7 @@ catch (sycl::exception const &exc)
  }

  static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
-
+ GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
  sycl::event* sycl_event = static_cast<sycl::event*>(event->context);

  if (ggml_backend_is_sycl(backend)) {
@@ -3907,6 +4099,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
  case GGML_UNARY_OP_HARDSIGMOID:
  case GGML_UNARY_OP_HARDSWISH:
  case GGML_UNARY_OP_GELU_QUICK:
+ case GGML_UNARY_OP_GELU_ERF:
  case GGML_UNARY_OP_TANH:
  case GGML_UNARY_OP_EXP:
  case GGML_UNARY_OP_SGN:
@@ -4052,6 +4245,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
  #endif
  case GGML_OP_NORM:
  case GGML_OP_RMS_NORM:
+ return true;
  case GGML_OP_L2_NORM:
  case GGML_OP_GROUP_NORM:
  return ggml_is_contiguous(op->src[0]);
@@ -4160,6 +4354,7 @@ static void ggml_backend_sycl_device_event_free(ggml_backend_dev_t dev, ggml_bac

  static void ggml_backend_sycl_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) try {
  GGML_UNUSED(dev);
+ GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);

  sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
  SYCL_CHECK(CHECK_TRY_ERROR(sycl_event->wait()));