@fugood/llama.node 0.3.13 → 0.3.15

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp

@@ -464,7 +464,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
 
     // CUDA backend on the server pads everything to 512 due to CUDA limitations.
@@ -478,6 +478,7 @@ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
         GGML_ASSERT(status);
     }
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
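The hunk above tracks this release's ggml-backend interface change (see ggml-backend-impl.h and ggml-backend.h in the file list): a buffer's init_tensor callback now reports an enum ggml_status instead of returning void. As a hedged illustration only, a hypothetical backend with nothing to initialize per tensor could satisfy the new contract like this (the backend name is made up; GGML_UNUSED and the status codes are existing ggml macros/enum values):

    static enum ggml_status my_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
        // nothing to set up in this toy backend; a real one would return
        // e.g. GGML_STATUS_ALLOC_FAILED when per-tensor initialization fails
        GGML_UNUSED(buffer);
        GGML_UNUSED(tensor);
        return GGML_STATUS_SUCCESS;
    }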
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt

@@ -1,3 +1,5 @@
+message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
+
 if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()
@@ -64,6 +66,9 @@ if (WIN32)
     find_package(MKL REQUIRED)
     target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
 else()
+    if (GGML_SYCL_GRAPH)
+        add_compile_definitions(GGML_SYCL_GRAPH)
+    endif()
     if (GGML_SYCL_TARGET STREQUAL "INTEL")
         target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
     elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp

@@ -26,9 +26,10 @@
 #include "softmax.hpp"
 #include "tsembd.hpp"
 #include "im2col.hpp"
-#include "wkv6.hpp"
+#include "wkv.hpp"
 #include "outprod.hpp"
 #include "element_wise.hpp"
+#include "cpy.hpp"
 #include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp

@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
             << ", line:" << __LINE__ << std::endl;
   std::exit(1);
 }
+
+
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            if (extra->events[i][is] != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
+            }
+        }
+        if (extra->data_device[i] != nullptr && streams.size()>0) {
+            ggml_sycl_set_device(i);
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+        }
+    }
+    delete extra;
+}
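release_extra_gpu() centralizes teardown of a tensor's per-device bookkeeping: it destroys any recorded events on every device, frees the device allocations only when a stream vector is supplied (matching the streams={} default declared in common.hpp below), and then deletes the extra itself. A hedged usage sketch; tensor and streams here are illustrative stand-ins, not code from this package:

    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;   // tensor: some ggml_tensor *
    release_extra_gpu(extra);              // destroys recorded events; device memory is left alone
    // release_extra_gpu(extra, streams);  // also frees data_device[i] through the per-device queues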
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp

@@ -19,6 +19,9 @@
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
+#include "sycl_hw.hpp"
+
+
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
@@ -31,11 +34,15 @@
 #pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
 #pragma clang diagnostic pop
+#include "ggml-impl.h"
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
-static int g_ggml_sycl_debug = 0;
+
+extern int g_ggml_sycl_debug;
+extern int g_ggml_sycl_disable_optimize;
+
 #define GGML_SYCL_DEBUG(...) \
   do { \
     if (g_ggml_sycl_debug) \
@@ -182,18 +189,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
 }
 
 //////////////////////
+struct optimize_feature {
+    bool reorder=false;
+};
+
+struct sycl_device_info {
+    int cc;             // compute capability
+    // int nsm;         // number of streaming multiprocessors
+    // size_t smpb;     // max. shared memory per block
+    bool vmm;           // virtual memory support
+    size_t total_vram;
+    sycl_hw_info hw_info;
+    optimize_feature opt_feature;
+};
+
 
 struct ggml_sycl_device_info {
     int device_count;
 
-    struct sycl_device_info {
-        int cc;             // compute capability
-        // int nsm;         // number of streaming multiprocessors
-        // size_t smpb;     // max. shared memory per block
-        bool vmm;           // virtual memory support
-        size_t total_vram;
-    };
-
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
 
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +273,47 @@ struct ggml_tensor_extra_gpu {
     // tensors
     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                           [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+    optimize_feature optimized_feature;
 };
 
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
+
+inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
+    optimize_feature opt;
+
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1 ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc ||
+         arch == syclex::architecture::intel_gpu_pvc_vg ||
+         arch == syclex::architecture::intel_gpu_mtl_u ||
+         arch == syclex::architecture::intel_gpu_mtl_s ||
+         arch == syclex::architecture::intel_gpu_mtl_h ||
+         arch == syclex::architecture::intel_gpu_arl_u ||
+         arch == syclex::architecture::intel_gpu_arl_s ||
+         arch == syclex::architecture::intel_gpu_arl_h ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m
+        );
+
+    return opt;
+}
+
+namespace sycl_ex = sycl::ext::oneapi::experimental;
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
+    optimize_feature opt_feature;
+    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
     explicit ggml_backend_sycl_context(int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        opt_feature = ggml_sycl_info().devices[device].opt_feature;
     }
 
     queue_ptr stream(int device, int stream) {
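check_gpu_optimize_feature() keys the new reorder path off the Intel GPU architecture. As a hedged sketch only (the selector and variable names are illustrative), the architecture value it expects is normally obtained through the oneAPI experimental device-architecture query before being stored in the per-device opt_feature:

    namespace syclex = sycl::ext::oneapi::experimental;

    sycl::device dev{sycl::gpu_selector_v};
    auto arch = dev.get_info<syclex::info::device::architecture>();
    optimize_feature feat = check_gpu_optimize_feature(arch);
    // feat.reorder later selects the reordered Q4_0 dequantization in convert.cpp below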
@@ -350,6 +393,10 @@ struct ggml_backend_sycl_context {
         return pool(device);
     }
 
+#ifdef GGML_SYCL_GRAPH
+    std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
+#endif
+
     ggml_sycl_pool & host_pool(int device) {
         if (host_pools[device] == nullptr) {
             host_pools[device] = new_pool_for_host(stream(device, 0), device);
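When GGML_SYCL_GRAPH is defined (see the CMakeLists.txt hunk above), the backend context can cache a finalized SYCL command graph in exec_graph. A rough sketch, under the oneAPI experimental graph extension, of how such an executable graph is typically recorded and replayed; this is an assumption about that extension's usage, not code copied from ggml-sycl.cpp:

    // illustrative fragment; headers and error handling omitted
    namespace sycl_ex = sycl::ext::oneapi::experimental;

    sycl::queue q{sycl::gpu_selector_v};
    sycl_ex::command_graph<sycl_ex::graph_state::modifiable> graph{q.get_context(), q.get_device()};

    graph.begin_recording(q);
    // ... enqueue the kernels of one compute step on q ...
    graph.end_recording();

    auto exec_graph = std::make_unique<
        sycl_ex::command_graph<sycl_ex::graph_state::executable>>(graph.finalize());
    q.ext_oneapi_graph(*exec_graph).wait();   // replay the captured work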
@@ -432,6 +479,7 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
         int ne10, int ne11, int ne12, int ne13,
         /*int s0, */ int s1, int s2, int s3,
+        /*int s00,*/ int s01, int s02, int s03,
         /*int s10,*/ int s11, int s12, int s13,
         const sycl::nd_item<3> &item_ct1) {
     const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -453,9 +501,9 @@ static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
     const int i12 = i2 % ne12;
     const int i13 = i3 % ne13;
 
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
     const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst = i_src0;
+    const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
 
     const src0_t * src0_row = src0 + i_src0;
     const src1_t * src1_row = src1 + i_src1;
@@ -473,6 +521,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
         int ne0, int ne1, int ne2, int ne3,
         int ne10, int ne11, int ne12, int ne13,
         /*int s0, */ int s1, int s2, int s3,
+        /*int s00,*/ int s01, int s02, int s03,
         /*int s10,*/ int s11, int s12, int s13,
         const sycl::nd_item<3> &item_ct1) {
 
@@ -492,9 +541,9 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
     const int i12 = i2 % ne12;
     const int i13 = i3 % ne13;
 
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
     const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst = i_src0;
+    const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
 
     const src0_t * src0_row = src0 + i_src0;
     const src1_t * src1_row = src1 + i_src1;
@@ -524,9 +573,11 @@ struct bin_bcast_sycl {
        int nr[4] = { nr0, nr1, nr2, nr3 };
 
        // collapse dimensions until first broadcast dimension
-       int64_t cne0[] = {ne0, ne1, ne2, ne3};
+       int64_t cne[] = {ne0, ne1, ne2, ne3};
+       int64_t cne0[] = {ne00, ne01, ne02, ne03};
        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-       size_t cnb0[] = {nb0, nb1, nb2, nb3};
+       size_t cnb[] = {nb0, nb1, nb2, nb3};
+       size_t cnb0[] = {nb00, nb01, nb02, nb03};
        size_t cnb1[] = {nb10, nb11, nb12, nb13};
        auto collapse = [](int64_t cne[]) {
            cne[0] *= cne[1];
@@ -541,32 +592,41 @@ struct bin_bcast_sycl {
            cnb[3] *= cne[3];
        };
 
-       for (int i = 0; i < 4; i++) {
-           if (nr[i] != 1) {
-               break;
-           }
-           if (i > 0) {
-               collapse_nb(cnb0, cne0);
-               collapse_nb(cnb1, cne1);
-               collapse(cne0);
-               collapse(cne1);
+       if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+           for (int i = 0; i < 4; i++) {
+               if (nr[i] != 1) {
+                   break;
+               }
+               if (i > 0) {
+                   collapse_nb(cnb, cne);
+                   collapse_nb(cnb0, cne0);
+                   collapse_nb(cnb1, cne1);
+                   collapse(cne);
+                   collapse(cne0);
+                   collapse(cne1);
+               }
            }
        }
        {
-           int64_t ne0 = cne0[0];
-           int64_t ne1 = cne0[1];
-           int64_t ne2 = cne0[2];
-           int64_t ne3 = cne0[3];
+           int64_t ne0 = cne[0];
+           int64_t ne1 = cne[1];
+           int64_t ne2 = cne[2];
+           int64_t ne3 = cne[3];
 
            int64_t ne10 = cne1[0];
            int64_t ne11 = cne1[1];
            int64_t ne12 = cne1[2];
            int64_t ne13 = cne1[3];
 
-           size_t nb0 = cnb0[0];
-           size_t nb1 = cnb0[1];
-           size_t nb2 = cnb0[2];
-           size_t nb3 = cnb0[3];
+           size_t nb0 = cnb[0];
+           size_t nb1 = cnb[1];
+           size_t nb2 = cnb[2];
+           size_t nb3 = cnb[3];
+
+           size_t nb00 = cnb0[0];
+           size_t nb01 = cnb0[1];
+           size_t nb02 = cnb0[2];
+           size_t nb03 = cnb0[3];
 
            size_t nb10 = cnb1[0];
            size_t nb11 = cnb1[1];
@@ -583,6 +643,28 @@ struct bin_bcast_sycl {
            size_t s12 = nb12 / sizeof(src1_t);
            size_t s13 = nb13 / sizeof(src1_t);
 
+           size_t s00 = nb00 / sizeof(src0_t);
+           size_t s01 = nb01 / sizeof(src0_t);
+           size_t s02 = nb02 / sizeof(src0_t);
+           size_t s03 = nb03 / sizeof(src0_t);
+
+           GGML_UNUSED(s00);
+
+           GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+           GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+           GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+           GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+           GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+           GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+           GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+           GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+           GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+           GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+           GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+           GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
            GGML_ASSERT(s0 == 1);
            GGML_ASSERT(s10 == 1);
 
@@ -619,8 +701,8 @@ struct bin_bcast_sycl {
                    [=](sycl::nd_item<3> item_ct1) {
                        k_bin_bcast_unravel<bin_op>(
                            src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                           ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                           s13, item_ct1);
+                           ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
+                           s03, s11, s12, s13, item_ct1);
                    });
            }
        } else {
@@ -638,7 +720,7 @@ struct bin_bcast_sycl {
                    [=](sycl::nd_item<3> item_ct1) {
                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                           s1, s2, s3, s11, s12, s13,
+                                           s1, s2, s3, s01, s02, s03, s11, s12, s13,
                                            item_ct1);
                    });
            }
@@ -680,5 +762,4 @@ bool gpu_has_xmx(sycl::device &dev);
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op);
-
 #endif // GGML_SYCL_COMMON_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp

@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
+                                             dpct::queue_ptr stream) {
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    int constexpr WARP_K = WARP_SIZE * QK4_0;
+    const int n_warp = (k + WARP_K - 1) / WARP_K;
+    GGML_ASSERT(k % 2 == 0);
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+                                               sycl::range<3>(1, 1, WARP_SIZE),
+                                           sycl::range<3>(1, 1, WARP_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
+                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+                         });
+
+}
+
 template <typename dst_t>
 static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
     }
 }
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     }
 }
 
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_row_q4_0_sycl;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_sycl;
         case GGML_TYPE_Q5_0:
package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp

@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
 typedef to_t_sycl_t<float> to_fp32_sycl_t;
 typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
 
 #endif // GGML_SYCL_CONVERT_HPP
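With the extra ggml_tensor *dst parameter, the converters can inspect dst->src[0]->extra and return dequantize_row_q4_0_sycl_reorder when the Q4_0 weight was stored in the reordered layout. A hedged sketch of what a call site looks like after this change; src0, dst, src0_f16 and stream are illustrative names, not code from this package:

    // src0 is the quantized weight tensor feeding dst; stream is the device queue
    const to_fp16_sycl_t to_fp16 = ggml_get_to_fp16_sycl(src0->type, dst);
    GGML_ASSERT(to_fp16 != nullptr);
    to_fp16(src0->data, src0_f16, ggml_nelements(src0), stream);   // src0_f16: sycl::half *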