@fugood/llama.node 0.3.3 → 0.3.4

This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +1 -1
  21. package/src/LlamaContext.cpp +81 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:

@@ -47,7 +47,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
  info.device_count = dpct::dev_mgr::instance().device_count();
  if (info.device_count == 0) {
- fprintf(stderr, "%s: failed to initialize " GGML_SYCL_NAME ": %s\n", __func__);
+ GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__);
  return info;
  }
 
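Note: the bulk of the changes in this file replace raw `fprintf(stderr, ...)` calls with the `GGML_LOG_*` macros, which route messages through ggml's pluggable log handler instead of writing to stderr unconditionally (the old line above also passed one argument for two `%s` specifiers, which the rewrite fixes). A minimal sketch of the pattern, assuming a simplified callback type; the real macros live in ggml's internal headers:

```cpp
#include <cstdarg>
#include <cstdio>

// Simplified stand-ins for ggml's log levels and callback plumbing.
enum log_level { LOG_DEBUG, LOG_INFO, LOG_WARN, LOG_ERROR };
typedef void (*log_callback)(log_level level, const char * text);

// Default sink reproduces the old behavior: write to stderr.
static void default_log_callback(log_level /*level*/, const char * text) {
    fputs(text, stderr);
}
static log_callback g_log_callback = default_log_callback;

static void log_internal(log_level level, const char * fmt, ...) {
    char buf[1024];
    va_list args;
    va_start(args, fmt);
    vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);
    g_log_callback(level, buf); // embedders can redirect logs without touching call sites
}

// Call sites use level-tagged macros instead of fprintf(stderr, ...).
#define GGML_LOG_DEBUG(...) log_internal(LOG_DEBUG, __VA_ARGS__)
#define GGML_LOG_INFO(...)  log_internal(LOG_INFO,  __VA_ARGS__)
#define GGML_LOG_WARN(...)  log_internal(LOG_WARN,  __VA_ARGS__)
#define GGML_LOG_ERROR(...) log_internal(LOG_ERROR, __VA_ARGS__)
```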
@@ -55,16 +55,16 @@ static ggml_sycl_device_info ggml_sycl_init() {
 
  int64_t total_vram = 0;
  #if defined(GGML_SYCL_FORCE_MMQ)
- fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
+ GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: yes\n", __func__);
  #else
- fprintf(stderr, "%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
+ GGML_LOG_INFO("%s: GGML_SYCL_FORCE_MMQ: no\n", __func__);
  #endif
  #if defined(SYCL_USE_XMX)
- fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+ GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__);
  #else
- fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+ GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__);
  #endif
- fprintf(stderr, "%s: found %d " GGML_SYCL_NAME " devices:\n", __func__, info.device_count);
+ GGML_LOG_INFO("%s: found %d %s devices:\n", __func__, info.device_count, GGML_SYCL_NAME);
 
  for (int i = 0; i < info.device_count; ++i) {
  info.devices[i].vmm = 0;
@@ -110,7 +110,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
 
  auto global_mem_size = prop.get_global_mem_size()/1000000;
 
- fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+ GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
  name.c_str(), version.c_str(), prop.get_max_compute_units(),
  prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
@@ -120,19 +120,29 @@ void ggml_backend_sycl_print_sycl_devices() {
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n");
  int device_count = dpct::dev_mgr::instance().device_count();
  std::map<std::string, size_t> DeviceNums;
- fprintf(stderr, "found %d SYCL devices:\n", device_count);
- fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
- fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
- fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
- fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
+ GGML_LOG_INFO("Found %d SYCL devices:\n", device_count);
+
+ GGML_LOG_INFO(
+ "| | | | "
+ " |Max | |Max |Global | |\n");
+ GGML_LOG_INFO(
+ "| | | | "
+ " |compute|Max work|sub |mem | |\n");
+ GGML_LOG_INFO(
+ "|ID| Device Type| "
+ "Name|Version|units |group |group|size | Driver version|\n");
+ GGML_LOG_INFO(
+ "|--|-------------------|---------------------------------------|------"
+ "-|-------|--------|-----|-------|---------------------|\n");
+
  for (int id = 0; id < device_count; ++id) {
- sycl::device device = dpct::dev_mgr::instance().get_device(id);
- sycl::backend backend = device.get_backend();
- std::string backend_type = get_device_backend_and_type(device);
- int type_id=DeviceNums[backend_type]++;
- std::stringstream device_type;
- device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
- print_device_detail(id, device, device_type.str());
+ sycl::device device = dpct::dev_mgr::instance().get_device(id);
+ std::string backend_type = get_device_backend_and_type(device);
+ int type_id = DeviceNums[backend_type]++;
+ std::stringstream device_type;
+ device_type << "[" << backend_type << ":" << std::to_string(type_id)
+ << "]";
+ print_device_detail(id, device, device_type.str());
  }
  }
 
@@ -154,15 +164,14 @@ static void ggml_check_sycl() try {
  static bool initialized = false;
 
  if (!initialized) {
- fprintf(stderr, "[SYCL] call ggml_check_sycl\n");
+ GGML_LOG_INFO("[SYCL] call ggml_check_sycl\n");
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
-
- fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
+ GGML_LOG_INFO("%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug);
 
  #if defined(GGML_SYCL_F16)
- fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
+ GGML_LOG_INFO("%s: GGML_SYCL_F16: yes\n", __func__);
  #else
- fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
+ GGML_LOG_INFO("%s: GGML_SYCL_F16: no\n", __func__);
  #endif
 
  /* NOT REMOVE, keep it for next optimize for XMX.
@@ -180,9 +189,10 @@ static void ggml_check_sycl() try {
  return;
  }
  GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
- ggml_backend_sycl_print_sycl_devices();
+
  initialized = true;
  g_sycl_loaded = true;
+ ggml_backend_sycl_print_sycl_devices();
  }
  }
  catch (sycl::exception const &exc) {
@@ -205,7 +215,7 @@ inline void check_allow_gpu_index(const int device_index) {
  __func__,
  device_index,
  ggml_sycl_info().device_count - 1);
- fprintf(stderr, "%s\n", error_buf);
+ GGML_LOG_ERROR("%s\n", error_buf);
  assert(false);
  }
  }
@@ -409,14 +419,12 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
  return true;
  }
  return false;
- }
- catch (sycl::exception const &exc) {
- std::cerr << exc.what() << "Exception caught at file:" << __FILE__
- << ", line:" << __LINE__ << std::endl;
- std::exit(1);
+ GGML_UNUSED(buffer);
+ } catch (const sycl::exception & exc) {
+ std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+ std::exit(1);
  }
 
-
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
  uint8_t value) try {
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
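Note: this release also converts the file's `(void) x;` statements to `GGML_UNUSED(x)` (and adds the macro for newly unused `ctx` parameters throughout). The macro is conventionally just a named wrapper around the same cast; a sketch of the idiom, with the definition assumed to match the common ggml spelling:

```cpp
// A named macro documents intent better than a bare cast and is easy to grep for.
// Assumed definition:
#define GGML_UNUSED(x) (void)(x)

// Example: a parameter kept for interface symmetry but not used by this stub.
static int op_stub(int used, int unused_ctx) {
    GGML_UNUSED(unused_ctx);
    return used * 2;
}
```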
@@ -475,8 +483,8 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
  SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
  size, *stream)));
  if (!dev_ptr) {
- fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
- return nullptr;
+ GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
+ return nullptr;
  }
  ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
  return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
@@ -752,7 +760,7 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
  size, *stream)));
  if (!buf) {
  char err_buf[1024];
- snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+ snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
  throw std::runtime_error(err_buf);
  }
  // set padding to 0 to avoid possible NaN values
@@ -1081,10 +1089,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
  ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {};
  size_t pool_size = 0;
 
- explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) :
- qptr(qptr_),
- device(device_) {
- }
+ explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {}
 
  ~ggml_sycl_pool_leg() {
  for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
@@ -1142,17 +1147,18 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
  CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
  look_ahead_size, *qptr)));
  if (!ptr) {
- fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+ GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
  return nullptr;
  }
 
  *actual_size = look_ahead_size;
  pool_size += look_ahead_size;
 
- #ifdef DEBUG_SYCL_MALLOC
- fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+ #ifdef DEBUG_SYCL_MALLOC
+ GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
  (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
- #endif
+ #endif
+
  // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr);
  return ptr;
  }
@@ -1166,7 +1172,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
  return;
  }
  }
- fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
+ GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
  SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
  pool_size -= size;
  }
@@ -1226,7 +1232,7 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
  zeros[i] = 0.f;
  qzeros[i] = 0;
  }
- const TC xi = ix < kx ? *(TC *)&x[iy * kx + ix] : zeros;
+ const TC xi = ix < kx ? *(const TC *)&x[iy * kx + ix] : zeros;
  float sum = xi[0];
  float amax = sycl::fabs(xi[0]);
  #pragma unroll
@@ -1787,6 +1793,9 @@ static void pool2d_nchw_kernel(
  switch (op) {
  case GGML_OP_POOL_AVG: res = 0; break;
  case GGML_OP_POOL_MAX: res = -FLT_MAX; break;
+ default:
+ res = (To) sycl::nan(uint32_t(0));
+ break;
  }
 
  for (int i = bh; i < eh; i += 1) {
@@ -1805,6 +1814,9 @@
  switch (op) {
  case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break;
  case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break;
+ default:
+ res = (To) sycl::nan(uint32_t(0));
+ break;
  }
  }
  }
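Note: the two `default:` branches above initialize the pooling accumulator to NaN when the op is neither AVG nor MAX, so an unexpected enum value poisons the result visibly instead of leaving `res` uninitialized, and the switch no longer has an unhandled case. The same defensive pattern in portable C++, using std::numeric_limits in place of SYCL's `sycl::nan(uint32_t(0))` quiet-NaN helper:

```cpp
#include <limits>

enum pool_op { POOL_AVG, POOL_MAX };

// Hypothetical host-side analogue of the kernel's accumulator setup.
template <typename T>
static T init_pool_acc(pool_op op) {
    switch (op) {
        case POOL_AVG: return T(0);
        case POOL_MAX: return std::numeric_limits<T>::lowest();
        default:
            // Unreachable today; NaN makes any future mistake show up in the output.
            return std::numeric_limits<T>::quiet_NaN();
    }
}
```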
@@ -1843,7 +1855,8 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
  s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
  });
 
- (void) dst;
+ GGML_UNUSED(dst);
+ GGML_UNUSED(ctx);
  }
 
  template <typename src0_t>
@@ -1881,10 +1894,10 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
  });
  }
 
- (void) dst;
+ GGML_UNUSED(dst);
+ GGML_UNUSED(ctx);
  }
 
-
 static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
  const int ky, const int kx_padded,
  queue_ptr stream) {
@@ -2437,7 +2450,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
  break;
  default:
  // TODO: k-quants
- fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+ GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
  GGML_ABORT("fatal error");
  break;
  }
@@ -2452,8 +2465,8 @@ static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tens
 
  ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
- (void) src1;
- (void) src1_d;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(src1_d);
  }
 
 
@@ -2472,17 +2485,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
  const int64_t ne00 = src0->ne[0];
  const int64_t ne10 = src1->ne[0];
 
- const int64_t ne0 = dst->ne[0];
 
  const int64_t row_diff = row_high - row_low;
 
  int id;
  SYCL_CHECK(
  CHECK_TRY_ERROR(id = get_current_device_id()));
-
+ #if !GGML_SYCL_DNNL
+ const int64_t ne0 = dst->ne[0];
  // the main device has a larger memory buffer to hold the results from all GPUs
  // ldc == nrows of the matrix that cuBLAS writes into
  int ldc = id == ctx.device ? ne0 : row_diff;
+ #endif
 
  #ifdef GGML_SYCL_F16
  bool use_fp16 = true; // TODO(Yu) SYCL capability check
@@ -2519,9 +2533,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
  : src1_as_f16.get();
  ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
 
- const sycl::half alpha_f16 = 1.0f;
- const sycl::half beta_f16 = 0.0f;
  #if !GGML_SYCL_DNNL
+ const sycl::half alpha_f16 = 1.0f;
+ const sycl::half beta_f16 = 0.0f;
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
  *stream, oneapi::mkl::transpose::trans,
  oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2558,24 +2572,29 @@ inline void ggml_sycl_op_mul_mat_sycl(
  const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
  const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get();
 
- const float alpha = 1.0f;
- const float beta = 0.0f;
  #if !GGML_SYCL_DNNL
+ const float alpha = 1.0f;
+ const float beta = 0.0f;
+ # ifdef GGML_SYCL_NVIDIA
  SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
- *stream, oneapi::mkl::transpose::trans,
- oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
- dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
- src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
+ oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ *stream }, oneapi::mkl::transpose::trans,
+ oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i,
+ ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
+ # else
+ SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
+ *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+ dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
  dst_dd_i, ldc)));
+ # endif
  #else
  auto dnnl_stream = ctx.stream_dnnl(stream);
  DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
  src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
  #endif
  }
- (void) dst;
- (void) src1_ddq_i;
- (void) src1_padded_row_size;
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_ddq_i);
+ GGML_UNUSED(src1_padded_row_size);
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -2621,8 +2640,9 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
  item_ct1);
  });
 
- (void) src1;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2637,9 +2657,10 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor
 
  sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2656,9 +2677,10 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_te
 
  sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2677,9 +2699,10 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_ten
 
  argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2696,9 +2719,10 @@ inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tens
 
  argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -2718,9 +2742,10 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const gg
 
  diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2741,9 +2766,10 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tenso
  */
  SYCL_CHECK(0);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
@@ -2766,9 +2792,10 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tenso
  */
  SYCL_CHECK(0);
 
- (void) src1;
- (void) dst;
- (void) src1_dd;
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(src1_dd);
+ GGML_UNUSED(ctx);
  }
 
  static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
@@ -2845,7 +2872,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
 
  ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
  ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
- ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
 
  const bool src0_is_contiguous = ggml_is_contiguous(src0);
  const bool src1_is_contiguous = ggml_is_contiguous(src1);
@@ -3272,7 +3298,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
 
  GGML_TENSOR_BINARY_OP_LOCALS
 
- const int64_t ne_dst = ggml_nelements(dst);
 
  SYCL_CHECK(ggml_sycl_set_device(ctx.device));
  queue_ptr main_stream = ctx.stream();;
@@ -3380,6 +3405,7 @@ catch (sycl::exception const &exc) {
 
  inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
  // TODO: accuracy issues in MMQ
+ GGML_UNUSED(type);
  return false;
  }
 
@@ -3447,8 +3473,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
  use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
 
  if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
- // KQ single-batch
- ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+ // TODO: Refactor and cleanup of mul mat dispatching.
+ if (src0->ne[3] == 1 && src1->ne[3] == 1) {
+ // KQ single-batch
+ // mmv p021 was specific for these dimensions
+ ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
+ } else {
+ // The kernel from the if path is faster for that specific case, but does not support all mul mats.
+ ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
+ }
  } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
  // KQV single-batch
  ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
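Note: the dispatch change above narrows the permuted-F16 fast path. `ggml_sycl_mul_mat_vec_p021` assumes both operands have a fourth dimension of 1, so 4-D batches now fall back to the slower but general batched kernel instead of hitting a kernel that cannot handle them. The guard reduces to a shape check like this (standalone sketch with a hypothetical tensor struct):

```cpp
#include <cstdint>

struct tensor4d { int64_t ne[4]; }; // ne[3] is the outermost batch dimension

// Mirrors the diff's condition: p021 is only safe when ne[3] == 1 on both sides.
static bool use_p021_fast_path(const tensor4d & src0, const tensor4d & src1) {
    return src0.ne[3] == 1 && src1.ne[3] == 1;
}
```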
@@ -3743,12 +3776,12 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
  } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
  ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
  } else {
- fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+ GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__,
  ggml_type_name(src0->type), ggml_type_name(src1->type));
  GGML_ABORT("fatal error");
  }
 
- (void) dst;
+ GGML_UNUSED(dst);
  }
  catch (sycl::exception const &exc) {
  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -3759,7 +3792,7 @@ catch (sycl::exception const &exc) {
  static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
  // TODO: why do we pass dst as src1 here?
  ggml_sycl_cpy(ctx, src0, dst, nullptr);
- (void) src1;
+ GGML_UNUSED(src1);
  }
 
  static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3804,13 +3837,16 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor
  }
 
  static void ggml_sycl_nop(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- (void) src0;
- (void) src1;
- (void) dst;
+ GGML_UNUSED(src0);
+ GGML_UNUSED(src1);
+ GGML_UNUSED(dst);
+ GGML_UNUSED(ctx);
  }
 
  void ggml_sycl_set_main_device(const int main_device) try {
- if (dpct::get_current_device_id() == main_device) return;
+ if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
+ return;
+ }
  check_allow_gpu_index(main_device);
  dpct::select_device(main_device);
 
@@ -3818,7 +3854,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
  dpct::device_info prop;
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
  prop, dpct::dev_mgr::instance().get_device(main_device))));
- fprintf(stderr, "Using device %d (%s) as main device\n",
+ GGML_LOG_INFO("Using device %d (%s) as main device\n",
  main_device, prop.get_name());
  }
  }
@@ -4165,7 +4201,7 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
  #endif
  bool ok = ggml_sycl_compute_forward(*sycl_ctx, node);
  if (!ok) {
- fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
  }
  GGML_ASSERT(ok);
  }
@@ -4178,6 +4214,7 @@ try
  {
  ggml_backend_sycl_context *sycl_ctx =
  (ggml_backend_sycl_context *)backend->context;
+
  sycl::event *sycl_event = static_cast<sycl::event *>(event->context);
 
  const queue_ptr &stream = sycl_ctx->stream(sycl_ctx->device, 0);
@@ -4192,7 +4229,7 @@ catch (sycl::exception const &exc)
  }
 
  static void ggml_backend_sycl_event_wait(ggml_backend_t backend, ggml_backend_event_t event) try {
- ggml_backend_sycl_context* sycl_ctx = static_cast<ggml_backend_sycl_context*>(backend->context);
+
  sycl::event* sycl_event = static_cast<sycl::event*>(event->context);
 
  if (ggml_backend_is_sycl(backend)) {
@@ -4350,10 +4387,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
  if (op->op == GGML_OP_MUL_MAT) {
  a = op->src[0];
  b = op->src[1];
- if (ggml_is_permuted(a) || ggml_is_permuted(b)) {
- // TODO: fix like https://github.com/ggerganov/llama.cpp/pull/10021
- return false;
- }
  } else {
  a = op->src[2];
  b = op->src[1];
@@ -4455,7 +4488,16 @@
  case GGML_OP_SOFT_MAX:
  return true;
  case GGML_OP_ROPE:
- return ggml_is_contiguous(op->src[0]);
+ {
+ const int mode = ((const int32_t *) op->op_params)[2];
+ if (mode & GGML_ROPE_TYPE_MROPE) {
+ return false;
+ }
+ if (mode & GGML_ROPE_TYPE_VISION) {
+ return false;
+ }
+ return ggml_is_contiguous(op->src[0]);
+ }
  case GGML_OP_IM2COL:
  // TODO: add support for the new F32 operations
  return op->src[0]->type == GGML_TYPE_F16;
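Note: `supports_op` now decodes the RoPE mode from the op's parameter block and rejects the multimodal variants (M-RoPE and vision RoPE, used by the Qwen2-VL support added elsewhere in this release) that the SYCL kernels do not implement yet. The mode is a bit field at index 2 of `op_params`; a sketch of the test, with illustrative flag values since the real `GGML_ROPE_TYPE_*` constants come from ggml.h:

```cpp
#include <cstdint>

// Illustrative bit assignments; the actual values are defined in ggml.h.
constexpr int32_t ROPE_TYPE_MROPE  = 1 << 3;
constexpr int32_t ROPE_TYPE_VISION = 1 << 4;

static bool rope_mode_supported(const int32_t * op_params) {
    const int32_t mode = op_params[2]; // mode sits at index 2, per the diff
    return (mode & ROPE_TYPE_MROPE) == 0 && (mode & ROPE_TYPE_VISION) == 0;
}
```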
@@ -4490,7 +4532,7 @@ static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_
  static int64_t get_op_batch_size(const ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_GET_ROWS:
- return op->ne[1]; // this will increse the speed of prefill in test
+ return 0;
  case GGML_OP_MUL_MAT:
  return op->ne[1];
  case GGML_OP_MUL_MAT_ID:
@@ -4604,13 +4646,14 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons
  // SYCL doesn't support registering host memory, left here for reference
  // "ggml_backend_register_host_buffer"
  // "ggml_backend_unregister_host_buffer"
+ GGML_UNUSED(name);
  return nullptr;
  }
 
  static const ggml_backend_reg_i ggml_backend_sycl_reg_interface = {
  /* .get_name = */ ggml_backend_sycl_reg_get_name,
  /* .get_device_count = */ ggml_backend_sycl_reg_get_device_count,
- /* .get_device_get = */ ggml_backend_sycl_reg_get_device,
+ /* .get_device = */ ggml_backend_sycl_reg_get_device,
  /* .get_proc_address = */ ggml_backend_sycl_reg_get_proc_address,
  };
 
@@ -4641,16 +4684,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() {
  dev_ctx->description = prop.get_name();
 
  ggml_backend_dev_t dev = new ggml_backend_device {
- /* .interface = */ ggml_backend_sycl_device_interface,
- /* .reg = */ &reg,
- /* .context = */ dev_ctx
+ /* .iface = */ ggml_backend_sycl_device_interface,
+ /* .reg = */ &reg,
+ /* .context = */ dev_ctx
  };
  ctx->devices.push_back(dev);
  }
 
  reg = ggml_backend_reg {
- /* .interface = */ ggml_backend_sycl_reg_interface,
- /* .context = */ ctx
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_sycl_reg_interface,
+ /* .context = */ ctx
  };
  }
 
@@ -4668,7 +4712,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
 
  ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context(device);
  if (ctx == nullptr) {
- fprintf(stderr, "%s: error: failed to allocate context\n", __func__);
+ GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
  return nullptr;
  };
 
@@ -4682,3 +4726,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
  return sycl_backend;
  }
 
+ GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)
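Note: the trailing `GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg)` ties into the dynamic backend loading introduced in ggml-backend-reg.cpp (see the +379-line change in the file list): when the backend is built as a loadable module, the macro emits an exported C entry point that hands the loader this backend's registration, and the `api_version` field added to `ggml_backend_reg` lets the loader reject incompatible modules. A rough, hypothetical approximation of the expansion:

```cpp
// Hypothetical sketch only; the real macro lives in ggml's backend headers.
struct ggml_backend_reg;                        // opaque registry handle
typedef struct ggml_backend_reg * ggml_backend_reg_t;

ggml_backend_reg_t ggml_backend_sycl_reg(void); // defined earlier in this file

extern "C" ggml_backend_reg_t ggml_backend_init(void) {
    // The loader dlopen()s the backend library, resolves this symbol,
    // and checks the registration's api_version before using it.
    return ggml_backend_sycl_reg();
}
```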
package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp:

@@ -120,6 +120,7 @@ void ggml_sycl_op_im2col(
  im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
  }
 
- (void) src0;
- (void) src0_dd;
+ GGML_UNUSED(src0);
+ GGML_UNUSED(src0_dd);
+ GGML_UNUSED(ctx);
  }