@fugood/llama.node 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/.github/workflows/build.yml +30 -1
  19. package/src/llama.cpp/CMakeLists.txt +9 -1
  20. package/src/llama.cpp/cmake/common.cmake +2 -0
  21. package/src/llama.cpp/common/arg.cpp +20 -2
  22. package/src/llama.cpp/common/common.cpp +6 -3
  23. package/src/llama.cpp/common/speculative.cpp +4 -4
  24. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  25. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +1 -1
  26. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -1
  27. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  28. package/src/llama.cpp/examples/imatrix/imatrix.cpp +1 -1
  29. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  30. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  31. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +4 -4
  32. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +1 -1
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -6
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  35. package/src/llama.cpp/examples/main/main.cpp +6 -6
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -5
  37. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  38. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  39. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  40. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  41. package/src/llama.cpp/examples/run/run.cpp +91 -46
  42. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  43. package/src/llama.cpp/examples/server/server.cpp +37 -15
  44. package/src/llama.cpp/examples/server/utils.hpp +3 -1
  45. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  46. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  47. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  48. package/src/llama.cpp/examples/tts/tts.cpp +20 -9
  49. package/src/llama.cpp/ggml/CMakeLists.txt +1 -0
  50. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  51. package/src/llama.cpp/ggml/include/ggml.h +24 -0
  52. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -28
  53. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  54. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -5
  55. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +15 -7
  56. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  57. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +150 -1
  58. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +284 -29
  59. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  60. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  61. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +7 -0
  62. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -4
  63. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +95 -22
  64. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +35 -12
  65. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -1
  66. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +93 -27
  67. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  68. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +12 -13
  69. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  70. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  71. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -2
  72. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +109 -40
  73. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  74. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +19 -20
  75. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  76. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  77. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -1
  78. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  79. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  80. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +398 -158
  81. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  82. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +7 -2
  83. package/src/llama.cpp/ggml/src/ggml.c +85 -2
  84. package/src/llama.cpp/include/llama.h +86 -22
  85. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  86. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  87. package/src/llama.cpp/src/llama-adapter.h +11 -9
  88. package/src/llama.cpp/src/llama-arch.cpp +103 -16
  89. package/src/llama.cpp/src/llama-arch.h +18 -0
  90. package/src/llama.cpp/src/llama-batch.h +2 -2
  91. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  92. package/src/llama.cpp/src/llama-context.h +214 -77
  93. package/src/llama.cpp/src/llama-cparams.h +1 -0
  94. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  95. package/src/llama.cpp/src/llama-graph.h +574 -0
  96. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  97. package/src/llama.cpp/src/llama-hparams.h +9 -0
  98. package/src/llama.cpp/src/llama-io.cpp +15 -0
  99. package/src/llama.cpp/src/llama-io.h +35 -0
  100. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  101. package/src/llama.cpp/src/llama-kv-cache.h +178 -110
  102. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  103. package/src/llama.cpp/src/llama-memory.h +21 -0
  104. package/src/llama.cpp/src/llama-model.cpp +8244 -173
  105. package/src/llama.cpp/src/llama-model.h +34 -1
  106. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  107. package/src/llama.cpp/src/llama.cpp +51 -9984
  108. package/src/llama.cpp/tests/test-backend-ops.cpp +145 -23
  109. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  110. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
@@ -46,6 +46,7 @@
46
46
  static bool g_sycl_loaded = false;
47
47
  int g_ggml_sycl_debug = 0;
48
48
  int g_ggml_sycl_disable_optimize = 0;
49
+ int g_ggml_sycl_disable_graph = 0;
49
50
 
50
51
  static ggml_sycl_device_info ggml_sycl_init() {
51
52
  ggml_sycl_device_info info = {};
@@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
95
96
  return info;
96
97
  }
97
98
 
98
- void print_device_detail(int id, sycl::device &device, std::string device_type) {
99
+ static void print_device_detail(int id, sycl::device &device, std::string device_type) {
99
100
 
100
101
  dpct::device_info prop;
101
102
  SYCL_CHECK(CHECK_TRY_ERROR(
@@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
118
119
  global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
119
120
  }
120
121
 
121
- void print_device_opt_feature(int device_count) {
122
+ static void print_device_opt_feature(int device_count) {
122
123
  GGML_LOG_INFO("SYCL Optimization Feature:\n");
123
124
  GGML_LOG_INFO(
124
125
  "|ID| Device Type|Reorder|\n");
@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
191
192
  if (!initialized) {
192
193
  g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
193
194
  g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
195
+ g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
194
196
  GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
195
197
  GGML_LOG_INFO("Running with Environment Variables:\n");
196
198
  GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
197
199
  GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
200
+ GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
198
201
  GGML_LOG_INFO("Build with Macros:\n");
199
202
  #if defined(GGML_SYCL_FORCE_MMQ)
200
203
  GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
@@ -333,10 +336,11 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
333
336
  assert(tensor->view_src->buffer->buft == buffer->buft);
334
337
  return GGML_STATUS_SUCCESS;
335
338
  }
336
-
337
- ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
338
- tensor->extra = extra;
339
- ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
339
+ if (tensor->type == GGML_TYPE_Q4_0) {
340
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
341
+ tensor->extra = extra;
342
+ ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
343
+ }
340
344
 
341
345
  if (ggml_is_quantized(tensor->type)) {
342
346
  // initialize padding to 0 to avoid possible NaN values
@@ -400,7 +404,7 @@ catch (sycl::exception const &exc) {
400
404
  std::exit(1);
401
405
  }
402
406
 
403
- void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
407
+ static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
404
408
  const void *ptr_src, size_t size) {
405
409
  char *host_buf = (char *)malloc(size);
406
410
  q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
@@ -486,6 +490,22 @@ catch (sycl::exception const &exc) {
486
490
  std::exit(1);
487
491
  }
488
492
 
493
+ static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) {
494
+ GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
495
+ if (buffer == nullptr) {
496
+ return;
497
+ }
498
+
499
+ ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
500
+
501
+ if (ctx != nullptr) {
502
+ for (ggml_tensor_extra_gpu * extra : ctx->tensor_extras) {
503
+ release_extra_gpu(extra);
504
+ }
505
+ ctx->tensor_extras.clear(); // reset the tensor_extras vector
506
+ }
507
+ }
508
+
489
509
  static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
490
510
  /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
491
511
  /* .get_base = */ ggml_backend_sycl_buffer_get_base,
@@ -495,7 +515,7 @@ static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
495
515
  /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor,
496
516
  /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor,
497
517
  /* .clear = */ ggml_backend_sycl_buffer_clear,
498
- /* .reset = */ NULL,
518
+ /* .reset = */ ggml_backend_sycl_buffer_reset,
499
519
  };
500
520
 
501
521
  // sycl buffer type
@@ -576,7 +596,6 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
576
596
  static std::mutex mutex;
577
597
  std::lock_guard<std::mutex> lock(mutex);
578
598
 
579
- GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
580
599
 
581
600
  auto dev_count = ggml_backend_sycl_get_device_count();
582
601
 
@@ -604,7 +623,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
604
623
  return &ggml_backend_sycl_buffer_types[device];
605
624
  }
606
625
 
607
- ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
626
+ static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
608
627
  GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");
609
628
 
610
629
  int device = ctx->device;
@@ -1666,7 +1685,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1666
1685
 
1667
1686
  stream->parallel_for(
1668
1687
  sycl::nd_range<3>(num_blocks * block_size, block_size),
1669
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
1688
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1670
1689
  quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1671
1690
  });
1672
1691
  }
@@ -1687,7 +1706,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
1687
1706
 
1688
1707
  stream->parallel_for(
1689
1708
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
1690
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
1709
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1691
1710
  mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
1692
1711
  nchannels_y, item_ct1);
1693
1712
  });
@@ -1707,7 +1726,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(
1707
1726
 
1708
1727
  stream->parallel_for(
1709
1728
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
1710
- [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
1729
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1711
1730
  mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
1712
1731
  row_stride_x, channel_stride_x,
1713
1732
  nchannels_y / nchannels_x, item_ct1);
@@ -1748,7 +1767,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
1748
1767
  const sycl::range<3> block_nums(1, nrows, 1);
1749
1768
  stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
1750
1769
  [=](sycl::nd_item<3> item_ct1)
1751
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
1770
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1752
1771
  k_sum_rows_f32(x, dst, ncols, item_ct1);
1753
1772
  });
1754
1773
  }
@@ -2039,9 +2058,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
2039
2058
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2040
2059
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
2041
2060
  #else
2042
- auto dnnl_stream = ctx.stream_dnnl(stream);
2043
- DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2044
- src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
2061
+ DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
2062
+ DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2063
+ dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
2045
2064
  const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2046
2065
  to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2047
2066
  #endif
@@ -2080,9 +2099,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
2080
2099
  dst_dd_i, ldc)));
2081
2100
  # endif
2082
2101
  #else
2083
- auto dnnl_stream = ctx.stream_dnnl(stream);
2084
- DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
2085
- src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
2102
+ DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
2103
+ DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
2104
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2086
2105
  #endif
2087
2106
  }
2088
2107
  GGML_UNUSED(dst);
@@ -2680,6 +2699,12 @@ static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * ds
2680
2699
  GGML_SYCL_DEBUG("call %s done\n", __func__);
2681
2700
  }
2682
2701
 
2702
+ static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2703
+ GGML_SYCL_DEBUG("call %s\n", __func__);
2704
+ ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_l2_norm);
2705
+ GGML_SYCL_DEBUG("call %s done\n", __func__);
2706
+ }
2707
+
2683
2708
  static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
2684
2709
  GGML_SYCL_DEBUG("call %s\n", __func__);
2685
2710
  ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm);
@@ -2898,7 +2923,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
2898
2923
  return false;
2899
2924
  }
2900
2925
 
2901
- bool ggml_sycl_supports_dmmv(enum ggml_type type) {
2926
+ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
2902
2927
  switch (type) {
2903
2928
  case GGML_TYPE_Q4_0:
2904
2929
  case GGML_TYPE_Q4_1:
@@ -3113,8 +3138,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
3113
3138
  const int64_t i2 = i12;
3114
3139
 
3115
3140
  src0_row.data = src0_original + i02*nb02;
3116
- src1_row.data = src1_original + + i11*nb11 + i12*nb12;
3117
- dst_row.data = dst_original + i1*nb1 + i2*nb2;
3141
+ src1_row.data = src1_original + i11*nb11 + i12*nb12;
3142
+ dst_row.data = dst_original + i1*nb1 + i2*nb2;
3118
3143
 
3119
3144
  ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
3120
3145
  }
@@ -3271,7 +3296,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
3271
3296
  }
3272
3297
 
3273
3298
 
3274
- void ggml_sycl_set_main_device(const int main_device) try {
3299
+ static void ggml_sycl_set_main_device(const int main_device) try {
3275
3300
  if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
3276
3301
  return;
3277
3302
  }
@@ -3292,7 +3317,7 @@ catch (sycl::exception const &exc) {
3292
3317
  std::exit(1);
3293
3318
  }
3294
3319
 
3295
- bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
3320
+ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
3296
3321
  if (!g_sycl_loaded) return false;
3297
3322
 
3298
3323
  if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3394,6 +3419,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
3394
3419
  case GGML_OP_RMS_NORM:
3395
3420
  ggml_sycl_rms_norm(ctx, dst);
3396
3421
  break;
3422
+ case GGML_OP_L2_NORM:
3423
+ ggml_sycl_l2_norm(ctx, dst);
3424
+ break;
3397
3425
  case GGML_OP_MUL_MAT:
3398
3426
  if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
3399
3427
  return false;
@@ -3471,6 +3499,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
3471
3499
  case GGML_OP_RWKV_WKV6:
3472
3500
  ggml_sycl_op_rwkv_wkv6(ctx, dst);
3473
3501
  break;
3502
+ case GGML_OP_RWKV_WKV7:
3503
+ ggml_sycl_op_rwkv_wkv7(ctx, dst);
3504
+ break;
3474
3505
  case GGML_OP_GATED_LINEAR_ATTN:
3475
3506
  ggml_sycl_op_gated_linear_attn(ctx, dst);
3476
3507
  break;
@@ -3610,7 +3641,7 @@ catch (sycl::exception const &exc) {
3610
3641
  std::exit(1);
3611
3642
  }
3612
3643
 
3613
- void reorder_qw(char *data_device, const int ncols, const int nrows,
3644
+ static void reorder_qw(char *data_device, const int ncols, const int nrows,
3614
3645
  size_t size, size_t offset, dpct::queue_ptr stream) {
3615
3646
  auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
3616
3647
  SYCL_CHECK(
@@ -3624,7 +3655,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
3624
3655
 
3625
3656
  stream->parallel_for(
3626
3657
  size / sizeof(block_q4_0),
3627
- [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
3658
+ [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
3628
3659
  const block_q4_0* x = (const block_q4_0*)tmp_buf;
3629
3660
  const int ib = i;
3630
3661
 
@@ -3638,7 +3669,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
3638
3669
  sycl::free(tmp_buf, *stream);
3639
3670
  }
3640
3671
 
3641
- void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
3672
+ static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
3642
3673
  char*data_device = (char*)src0->data;
3643
3674
  size_t ncols = src0->ne[0];
3644
3675
  size_t nrows = src0->ne[1];
@@ -3647,7 +3678,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
3647
3678
  reorder_qw(data_device, ncols, nrows, size, 0, stream);
3648
3679
  }
3649
3680
 
3650
- void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
3681
+ static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
3651
3682
  ggml_tensor *src0 = dst->src[0];
3652
3683
  ggml_tensor *src1 = dst->src[1];
3653
3684
 
@@ -3660,7 +3691,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
3660
3691
  }
3661
3692
  }
3662
3693
 
3663
- void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
3694
+ static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
3664
3695
  dpct::queue_ptr stream = ctx->stream();
3665
3696
  if (ctx->optimized_graph) {
3666
3697
  return;
@@ -3671,10 +3702,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
3671
3702
  if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
3672
3703
  }
3673
3704
  }
3674
- static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
3675
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3676
- ggml_sycl_set_main_device(sycl_ctx->device);
3677
3705
 
3706
+ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
3707
+ ggml_sycl_set_main_device(sycl_ctx->device);
3678
3708
  if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
3679
3709
 
3680
3710
  for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3696,7 +3726,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
3696
3726
  }
3697
3727
  GGML_ASSERT(ok);
3698
3728
  }
3729
+ }
3730
+
3731
+ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
3732
+ auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
3733
+
3734
+ #ifdef GGML_SYCL_GRAPH
3735
+ if (!g_ggml_sycl_disable_graph) {
3736
+ if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
3737
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
3738
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
3739
+ return GGML_STATUS_SUCCESS;
3740
+ }
3741
+
3742
+ sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
3743
+ model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
3744
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
3745
+ model_sycl_graph.end_recording();
3746
+
3747
+ if (!sycl_ctx->exec_graph) {
3748
+ auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
3749
+ sycl_ctx->exec_graph = std::make_unique<
3750
+ sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
3751
+ } else {
3752
+ try {
3753
+ sycl_ctx->exec_graph->update(model_sycl_graph);
3754
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
3755
+ } catch (sycl::exception const & e) {
3756
+ GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
3757
+ auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
3758
+ sycl_ctx->exec_graph = std::make_unique<
3759
+ sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
3760
+ }
3761
+ }
3699
3762
 
3763
+ sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
3764
+ } else
3765
+ #endif
3766
+ {
3767
+ ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
3768
+ }
3700
3769
  return GGML_STATUS_SUCCESS;
3701
3770
  }
3702
3771
 
@@ -3761,7 +3830,6 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
3761
3830
  }
3762
3831
 
3763
3832
  int ggml_backend_sycl_get_device_count() {
3764
- GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
3765
3833
  return ggml_sycl_info().device_count;
3766
3834
  }
3767
3835
 
@@ -3851,7 +3919,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3851
3919
  return true;
3852
3920
  }
3853
3921
  return false;
3854
- } break;
3922
+ }
3855
3923
  case GGML_OP_UNARY:
3856
3924
  switch (ggml_get_unary_op(op)) {
3857
3925
  case GGML_UNARY_OP_NEG:
@@ -3869,7 +3937,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3869
3937
  default:
3870
3938
  return false;
3871
3939
  }
3872
- break;
3873
3940
  case GGML_OP_MUL_MAT:
3874
3941
  case GGML_OP_MUL_MAT_ID:
3875
3942
  {
@@ -3900,7 +3967,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3900
3967
  return false;
3901
3968
  }
3902
3969
  return true;
3903
- } break;
3970
+ }
3904
3971
  case GGML_OP_OUT_PROD:
3905
3972
  return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
3906
3973
  case GGML_OP_GET_ROWS:
@@ -3917,7 +3984,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3917
3984
  default:
3918
3985
  return false;
3919
3986
  }
3920
- } break;
3987
+ }
3921
3988
  case GGML_OP_CPY:
3922
3989
  {
3923
3990
  ggml_type src0_type = op->src[0]->type;
@@ -3968,12 +4035,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3968
4035
  return true;
3969
4036
  }
3970
4037
  return false;
3971
- } break;
4038
+ }
3972
4039
  case GGML_OP_CONCAT:
3973
4040
  {
3974
4041
  ggml_type src0_type = op->src[0]->type;
3975
4042
  return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
3976
- } break;
4043
+ }
3977
4044
  case GGML_OP_DUP:
3978
4045
  case GGML_OP_ARGMAX:
3979
4046
  case GGML_OP_NONE:
@@ -3997,6 +4064,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
3997
4064
  return (op->src[0]->type == GGML_TYPE_F32);
3998
4065
  case GGML_OP_NORM:
3999
4066
  case GGML_OP_RMS_NORM:
4067
+ case GGML_OP_L2_NORM:
4000
4068
  case GGML_OP_GROUP_NORM:
4001
4069
  return ggml_is_contiguous(op->src[0]);
4002
4070
  case GGML_OP_SCALE:
@@ -4030,6 +4098,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4030
4098
  case GGML_OP_LEAKY_RELU:
4031
4099
  case GGML_OP_TIMESTEP_EMBEDDING:
4032
4100
  case GGML_OP_RWKV_WKV6:
4101
+ case GGML_OP_RWKV_WKV7:
4033
4102
  case GGML_OP_GATED_LINEAR_ATTN:
4034
4103
  return true;
4035
4104
  default:
@@ -3017,7 +3017,6 @@ void ggml_sycl_op_mul_mat_q(
3017
3017
  break;
3018
3018
  default:
3019
3019
  GGML_ABORT("fatal error");
3020
- break;
3021
3020
  }
3022
3021
 
3023
3022
  GGML_UNUSED(src1);
@@ -495,7 +495,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
495
495
  cgh.parallel_for(
496
496
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
497
497
  [=](sycl::nd_item<3> item_ct1)
498
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
498
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
499
499
  mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
500
500
  VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
501
501
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -519,7 +519,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
519
519
  cgh.parallel_for(
520
520
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
521
521
  [=](sycl::nd_item<3> item_ct1)
522
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
522
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
523
523
  mul_mat_vec_q<QK4_0, QI4_1, block_q4_1,
524
524
  VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(
525
525
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -543,7 +543,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
543
543
  cgh.parallel_for(
544
544
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
545
545
  [=](sycl::nd_item<3> item_ct1)
546
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
546
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
547
547
  mul_mat_vec_q<QK5_0, QI5_0, block_q5_0,
548
548
  VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(
549
549
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -567,7 +567,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
567
567
  cgh.parallel_for(
568
568
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
569
569
  [=](sycl::nd_item<3> item_ct1)
570
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
570
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
571
571
  mul_mat_vec_q<QK5_1, QI5_1, block_q5_1,
572
572
  VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(
573
573
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -591,7 +591,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
591
591
  cgh.parallel_for(
592
592
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
593
593
  [=](sycl::nd_item<3> item_ct1)
594
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
594
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
595
595
  mul_mat_vec_q<QK8_0, QI8_0, block_q8_0,
596
596
  VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(
597
597
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -615,7 +615,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
615
615
  cgh.parallel_for(
616
616
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
617
617
  [=](sycl::nd_item<3> item_ct1)
618
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
618
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
619
619
  mul_mat_vec_q<QK_K, QI2_K, block_q2_K,
620
620
  VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(
621
621
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -639,7 +639,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
639
639
  cgh.parallel_for(
640
640
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
641
641
  [=](sycl::nd_item<3> item_ct1)
642
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
642
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
643
643
  mul_mat_vec_q<QK_K, QI3_K, block_q3_K,
644
644
  VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(
645
645
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -663,7 +663,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
663
663
  cgh.parallel_for(
664
664
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
665
665
  [=](sycl::nd_item<3> item_ct1)
666
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
666
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
667
667
  mul_mat_vec_q<QK_K, QI4_K, block_q4_K,
668
668
  VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(
669
669
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -687,7 +687,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
687
687
  cgh.parallel_for(
688
688
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
689
689
  [=](sycl::nd_item<3> item_ct1)
690
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
690
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
691
691
  mul_mat_vec_q<QK_K, QI5_K, block_q5_K,
692
692
  VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(
693
693
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -711,7 +711,7 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
711
711
  cgh.parallel_for(
712
712
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
713
713
  [=](sycl::nd_item<3> item_ct1)
714
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
714
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
715
715
  mul_mat_vec_q<QK_K, QI6_K, block_q6_K,
716
716
  VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(
717
717
  vx, vy, dst, ncols, nrows, item_ct1);
@@ -734,7 +734,7 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy,
734
734
  cgh.parallel_for(
735
735
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
736
736
  [=](sycl::nd_item<3> item_ct1)
737
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
737
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
738
738
  mul_mat_vec_q_iq2_xxs_q8_1<QK_K, QI2_XXS/2, block_iq2_xxs, 1>(
739
739
  vx, vy, dst, ncols, nrows, item_ct1);
740
740
  });
@@ -755,7 +755,7 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy,
755
755
  cgh.parallel_for(
756
756
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
757
757
  [=](sycl::nd_item<3> item_ct1)
758
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
758
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
759
759
  mul_mat_vec_q_iq2_xs_q8_1<QK_K, QI2_XS/2, block_iq2_xs, 1>(
760
760
  vx, vy, dst, ncols, nrows, item_ct1);
761
761
  });
@@ -777,7 +777,7 @@ static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
777
777
  cgh.parallel_for(
778
778
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
779
779
  [=](sycl::nd_item<3> item_ct1)
780
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
780
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
781
781
  mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S/2, block_iq2_s, 1>(
782
782
  vx, vy, dst, ncols, nrows, item_ct1);
783
783
  });
@@ -799,7 +799,7 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy,
799
799
  cgh.parallel_for(
800
800
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
801
801
  [=](sycl::nd_item<3> item_ct1)
802
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
802
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
803
803
  mul_mat_vec_q_iq3_xxs_q8_1<QK_K, QI3_XXS/2, block_iq3_xxs, 1>(
804
804
  vx, vy, dst, ncols, nrows, item_ct1);
805
805
  });
@@ -821,7 +821,7 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy,
821
821
  cgh.parallel_for(
822
822
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
823
823
  [=](sycl::nd_item<3> item_ct1)
824
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
824
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
825
825
  mul_mat_vec_q_iq3_s_q8_1<QK_K, QI3_S/2, block_iq3_s, 1>(
826
826
  vx, vy, dst, ncols, nrows, item_ct1);
827
827
  });
@@ -843,7 +843,7 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
843
843
  cgh.parallel_for(
844
844
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
845
845
  [=](sycl::nd_item<3> item_ct1)
846
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
846
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
847
847
  mul_mat_vec_q_iq1_s_q8_1<QK_K, QI1_S, block_iq1_s, 1>(
848
848
  vx, vy, dst, ncols, nrows, item_ct1);
849
849
  });
@@ -864,7 +864,7 @@ static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy,
864
864
  cgh.parallel_for(
865
865
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
866
866
  [=](sycl::nd_item<3> item_ct1)
867
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
867
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
868
868
  mul_mat_vec_q_iq1_m_q8_1<QK_K, QI1_S, block_iq1_m, 1>(
869
869
  vx, vy, dst, ncols, nrows, item_ct1);
870
870
  });
@@ -886,7 +886,7 @@ static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy,
886
886
  cgh.parallel_for(
887
887
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
888
888
  [=](sycl::nd_item<3> item_ct1)
889
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
889
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
890
890
  mul_mat_vec_q_iq4_nl_q8_1<QK4_NL, QI4_NL, block_iq4_nl, 2>(
891
891
  vx, vy, dst, ncols, nrows, item_ct1);
892
892
  });
@@ -908,7 +908,7 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
908
908
  cgh.parallel_for(
909
909
  sycl::nd_range<3>(block_nums * block_dims, block_dims),
910
910
  [=](sycl::nd_item<3> item_ct1)
911
- [[intel::reqd_sub_group_size(WARP_SIZE)]] {
911
+ [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
912
912
  mul_mat_vec_q_iq4_xs_q8_1<QK_K, QI4_XS/4, block_iq4_xs, 1>(
913
913
  vx, vy, dst, ncols, nrows, item_ct1);
914
914
  });
@@ -1003,7 +1003,6 @@ void ggml_sycl_op_mul_mat_vec_q(
1003
1003
  break;
1004
1004
  default:
1005
1005
  GGML_ABORT("fatal error");
1006
- break;
1007
1006
  }
1008
1007
  }
1009
1008
  GGML_UNUSED(src1);