@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -464,7 +464,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
     return result;
 }
 
-static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
 
     // CUDA backend on the server pads everything to 512 due to CUDA limitations.
@@ -478,6 +478,7 @@ static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
         bool status = send_rpc_cmd(ctx->sock, RPC_CMD_INIT_TENSOR, &request, sizeof(request), nullptr, 0);
         GGML_ASSERT(status);
     }
+    return GGML_STATUS_SUCCESS;
 }
 
 static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
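
This tracks an upstream ggml interface change: backend buffer init_tensor hooks now return enum ggml_status instead of void, so per-tensor initialization failures can propagate to the caller rather than being silently dropped. A minimal sketch of a hook under the new contract, assuming only the public ggml headers; the function name and its body are invented for illustration, only the signature shape mirrors the diff:

```cpp
// Illustrative only: a backend init_tensor hook under the new contract.
// GGML_STATUS_* and the buffer/tensor types come from ggml.h / ggml-backend.h;
// my_buffer_init_tensor itself is a made-up example, not code from this diff.
#include "ggml.h"
#include "ggml-backend.h"

static enum ggml_status my_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                              struct ggml_tensor * tensor) {
    GGML_UNUSED(buffer);
    if (tensor->data == NULL) {
        return GGML_STATUS_ALLOC_FAILED; // failure is now visible to the caller
    }
    // backend-specific per-tensor setup would go here
    return GGML_STATUS_SUCCESS;          // the old void version just fell off the end
}
```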
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt
@@ -1,3 +1,5 @@
+message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")
+
 if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp
@@ -29,6 +29,7 @@
 #include "wkv6.hpp"
 #include "outprod.hpp"
 #include "element_wise.hpp"
+#include "cpy.hpp"
 #include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp
@@ -99,3 +99,20 @@ catch (sycl::exception const &exc) {
             << ", line:" << __LINE__ << std::endl;
   std::exit(1);
 }
+
+
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
+    for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
+        for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
+            if (extra->events[i][is] != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is])));
+            }
+        }
+        if (extra->data_device[i] != nullptr && streams.size()>0) {
+            ggml_sycl_set_device(i);
+            SYCL_CHECK(
+                CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
+        }
+    }
+    delete extra;
+}
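
The new release_extra_gpu helper centralizes teardown of the per-tensor GPU extras: it destroys any recorded events on every device, frees the device allocations only when the caller supplies the owning streams (the default empty vector skips the sycl::free branch), and finally deletes the struct. A hedged sketch of a call site; the wrapper function and its arguments are assumptions, with common.hpp being the header patched below:

```cpp
// Hypothetical caller; `tensor` and `streams` are assumed to come from the
// backend's own bookkeeping. common.hpp (patched below) declares
// release_extra_gpu, queue_ptr, and ggml_tensor_extra_gpu.
#include "common.hpp"

static void free_tensor_extra(ggml_tensor * tensor, std::vector<queue_ptr> streams) {
    auto * extra = static_cast<ggml_tensor_extra_gpu *>(tensor->extra);
    if (extra == nullptr) {
        return;
    }
    // With an empty vector only the recorded events are destroyed; passing the
    // owning streams additionally frees extra->data_device[i] on each queue.
    release_extra_gpu(extra, streams);
    tensor->extra = nullptr;
}
```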
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp
@@ -19,6 +19,9 @@
 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
+#include "sycl_hw.hpp"
+
+
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
 #include "dnnl_sycl.hpp"
@@ -31,11 +34,15 @@
 #pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
 #pragma clang diagnostic pop
+#include "ggml-impl.h"
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
-static int g_ggml_sycl_debug = 0;
+
+extern int g_ggml_sycl_debug;
+extern int g_ggml_sycl_disable_optimize;
+
 #define GGML_SYCL_DEBUG(...) \
     do { \
         if (g_ggml_sycl_debug) \
@@ -182,18 +189,24 @@ inline dpct::err0 ggml_sycl_set_device(const int device) try {
 }
 
 //////////////////////
+struct optimize_feature {
+    bool reorder=false;
+};
+
+struct sycl_device_info {
+    int cc;          // compute capability
+    // int nsm;      // number of streaming multiprocessors
+    // size_t smpb;  // max. shared memory per block
+    bool vmm;        // virtual memory support
+    size_t total_vram;
+    sycl_hw_info hw_info;
+    optimize_feature opt_feature;
+};
+
 
 struct ggml_sycl_device_info {
     int device_count;
 
-    struct sycl_device_info {
-        int cc;          // compute capability
-        // int nsm;      // number of streaming multiprocessors
-        // size_t smpb;  // max. shared memory per block
-        bool vmm;        // virtual memory support
-        size_t total_vram;
-    };
-
     sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {};
 
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
@@ -260,17 +273,46 @@ struct ggml_tensor_extra_gpu {
     // tensors
     dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
                           [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+    optimize_feature optimized_feature;
 };
 
+void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams={});
+
+inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
+    optimize_feature opt;
+
+    opt.reorder =
+        (arch == syclex::architecture::intel_gpu_dg1 ||
+         arch == syclex::architecture::intel_gpu_acm_g10 ||
+         arch == syclex::architecture::intel_gpu_acm_g11 ||
+         arch == syclex::architecture::intel_gpu_acm_g12 ||
+         arch == syclex::architecture::intel_gpu_pvc ||
+         arch == syclex::architecture::intel_gpu_pvc_vg ||
+         arch == syclex::architecture::intel_gpu_mtl_u ||
+         arch == syclex::architecture::intel_gpu_mtl_s ||
+         arch == syclex::architecture::intel_gpu_mtl_h ||
+         arch == syclex::architecture::intel_gpu_arl_u ||
+         arch == syclex::architecture::intel_gpu_arl_s ||
+         arch == syclex::architecture::intel_gpu_arl_h ||
+         arch == syclex::architecture::intel_gpu_bmg_g21 ||
+         arch == syclex::architecture::intel_gpu_lnl_m
+        );
+
+    return opt;
+}
+
 struct ggml_backend_sycl_context {
     int device;
     std::string name;
+    optimize_feature opt_feature;
+    bool optimized_graph=false;
 
     queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
 
     explicit ggml_backend_sycl_context(int device) :
         device(device),
         name(GGML_SYCL_NAME + std::to_string(device)) {
+        opt_feature = ggml_sycl_info().devices[device].opt_feature;
     }
 
     queue_ptr stream(int device, int stream) {
@@ -680,5 +722,4 @@ bool gpu_has_xmx(sycl::device &dev);
 void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                           const ggml_tensor *src1, ggml_tensor *dst,
                           const ggml_sycl_op_flatten_t op);
-
 #endif // GGML_SYCL_COMMON_HPP
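
check_gpu_optimize_feature whitelists the Intel GPU architectures that benefit from the reordered Q4_0 layout, and each ggml_backend_sycl_context now snapshots its device's opt_feature at construction. A sketch of how the per-device flag would typically be populated during device enumeration, assuming the oneAPI experimental device-architecture query; init_device_opt_feature is an invented name, not part of this diff:

```cpp
// Sketch: filling sycl_device_info::opt_feature from a queried architecture.
// The get_info<syclex::info::device::architecture>() call is the oneAPI
// experimental architecture query; the types come from common.hpp above.
#include "common.hpp"

static void init_device_opt_feature(sycl_device_info & info, sycl::device & dev) {
    syclex::architecture arch = dev.get_info<syclex::info::device::architecture>();
    // reorder ends up true only on the architectures whitelisted above
    info.opt_feature = check_gpu_optimize_feature(arch);
}
```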
package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp
@@ -125,6 +125,25 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
+                                             dpct::queue_ptr stream) {
+
+    dpct::has_capability_or_fail(stream->get_device(),
+                                 {sycl::aspect::fp16});
+
+    int constexpr WARP_K = WARP_SIZE * QK4_0;
+    const int n_warp = (k + WARP_K - 1) / WARP_K;
+    GGML_ASSERT(k % 2 == 0);
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
+                                               sycl::range<3>(1, 1, WARP_SIZE),
+                                           sycl::range<3>(1, 1, WARP_SIZE)),
+                         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]]{
+                             dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
+                         });
+
+}
+
 template <typename dst_t>
 static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
                                      dpct::queue_ptr stream) {
@@ -452,10 +471,15 @@ static void convert_unary_sycl(const void *__restrict__ vx,
     }
 }
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
@@ -499,10 +523,15 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     }
 }
 
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_row_q4_0_sycl;
+            if (dst->src[0]->extra &&
+                ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
+                return dequantize_row_q4_0_sycl_reorder;
+            } else {
+                return dequantize_row_q4_0_sycl;
+            }
         case GGML_TYPE_Q4_1:
             return dequantize_row_q4_1_sycl;
         case GGML_TYPE_Q5_0:
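
Both getters now receive the destination tensor so they can look through dst->src[0]->extra and hand back the reorder-aware Q4_0 kernel when the weight was stored in the reordered layout. A hedged sketch of an updated call site; the helper name, buffer, and queue are illustrative, and the converter's (src, dst, count, queue) argument order is assumed from the truncated typedef in convert.hpp below:

```cpp
// Hypothetical call site for the dst-aware getter; dst_dd is assumed to be a
// preallocated fp32 device buffer of ggml_nelements(src0) elements and
// `stream` a live queue for the active device.
#include "convert.hpp"

static void dequantize_src0_to_fp32(ggml_tensor * dst, float * dst_dd,
                                    dpct::queue_ptr stream) {
    const ggml_tensor * src0 = dst->src[0];
    to_fp32_sycl_t to_fp32 = ggml_get_to_fp32_sycl(src0->type, dst);
    // For Q4_0 weights whose extra has optimized_feature.reorder set, this is
    // dequantize_row_q4_0_sycl_reorder; otherwise the stock row kernel.
    to_fp32(src0->data, dst_dd, ggml_nelements(src0), stream);
}
```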
package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp
@@ -21,7 +21,7 @@ using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
 typedef to_t_sycl_t<float> to_fp32_sycl_t;
 typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
-to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type);
-to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type);
+to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
+to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
 
 #endif // GGML_SYCL_CONVERT_HPP