@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169

@@ -33,11 +33,12 @@
 #include <sycl/half_type.hpp>

 #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
 #include "ggml-backend-impl.h"

 #include "ggml-sycl/backend.hpp"
 #include "ggml-sycl/presets.hpp"
+#include "ggml-sycl/gemm.hpp"

 bool ggml_sycl_loaded(void);
 void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -893,43 +894,6 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }

-template <typename T>
-static void im2col_kernel(const float *x, T *dst, int offset_delta,
-                          int IW, int IH, int OW, int KW, int KH,
-                          int pelements, int CHW, int s0, int s1, int p0,
-                          int p1, int d0, int d1,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (i >= pelements) {
-        return;
-    }
-
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        (item_ct1.get_group(1) * OW + ix) * CHW +
-        (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] =
-            sycl::vec<float, 1>(0.0f)
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    } else {
-        const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
-        dst[offset_dst] =
-            sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    }
-}
-
 template <typename Ti, typename To>
 static void pool2d_nchw_kernel(
         const int ih, const int iw, const int oh, const int ow,
@@ -1742,32 +1706,6 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
         });
 }

-template <typename T>
-static void im2col_sycl(const float *x, T *dst, int IW, int IH,
-                        int OW, int OH, int KW, int KH, int IC,
-                        int offset_delta, int s0, int s1, int p0,
-                        int p1, int d0, int d1,
-                        queue_ptr stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
-    sycl::range<3> block_nums(IC, OH, num_blocks);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums *
-                                  sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
-                              parallel_elements, (IC * KH * KW), s0, s1, p0,
-                              p1, d0, d1, item_ct1);
-            });
-    }
-}
-
-
 static bool g_sycl_loaded = false;

 bool ggml_sycl_loaded(void) {
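
Note: the two removal hunks above drop the inline im2col SYCL kernels from this file; per the file list, the op now lives in the new ggml-sycl/im2col.cpp and im2col.hpp (entries 139 and 140). For orientation, here is a minimal scalar C++ sketch of the gather the deleted kernel performed, reduced to a single channel so the CHW stride terms drop out; the function name and the driver values in main are illustrative, not part of the package.

#include <cstdio>
#include <vector>

// Scalar reference for the gather done by the removed im2col_kernel:
// output element (oh, ow, ky, kx) reads x[ih][iw] with
//   iw = ow * s0 + kx * d0 - p0
//   ih = oh * s1 + ky * d1 - p1
// and writes 0 when the source index lands in the padding.
static void im2col_ref(const float * x, float * dst,
                       int IW, int IH, int OW, int OH, int KW, int KH,
                       int s0, int s1, int p0, int p1, int d0, int d1) {
    for (int oh = 0; oh < OH; ++oh) {
        for (int ow = 0; ow < OW; ++ow) {
            for (int ky = 0; ky < KH; ++ky) {
                for (int kx = 0; kx < KW; ++kx) {
                    const int iw = ow * s0 + kx * d0 - p0;
                    const int ih = oh * s1 + ky * d1 - p1;
                    const int o  = ((oh * OW + ow) * KH + ky) * KW + kx;
                    dst[o] = (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
                                 ? 0.0f : x[ih * IW + iw];
                }
            }
        }
    }
}

int main() {
    // 4x4 input, 3x3 kernel, stride 1, padding 1, dilation 1 -> 4x4 output
    std::vector<float> x(16);
    for (int i = 0; i < 16; ++i) x[i] = (float) i;
    std::vector<float> dst(4 * 4 * 3 * 3);
    im2col_ref(x.data(), dst.data(), 4, 4, 4, 4, 3, 3, 1, 1, 1, 1, 1, 1);
    printf("kernel center of patch (0,0): %f\n", dst[4]); // reads x[0] = 0
}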
@@ -2016,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
            SYCL_CHECK(
                CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                                    look_ahead_size, *qptr)));
+           if (!ptr) {
+               fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+               return nullptr;
+           }
+
            *actual_size = look_ahead_size;
            pool_size += look_ahead_size;

@@ -2545,6 +2488,7 @@ inline void ggml_sycl_op_mul_mat_sycl(

        const sycl::half alpha_f16 = 1.0f;
        const sycl::half beta_f16 = 0.0f;
+#if !GGML_SYCL_DNNL
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
            *stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
@@ -2554,6 +2498,13 @@ inline void ggml_sycl_op_mul_mat_sycl(
            dpct::library_data_t::real_half)));
        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+#else
+        auto dnnl_stream = ctx.stream_dnnl(stream);
+        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+            src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
+#endif
    }
    else {
        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
@@ -2576,13 +2527,18 @@ inline void ggml_sycl_op_mul_mat_sycl(

        const float alpha = 1.0f;
        const float beta = 0.0f;
-
+#if !GGML_SYCL_DNNL
        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
            *stream, oneapi::mkl::transpose::trans,
            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
            dpct::get_value(&alpha, *stream), src0_ddf_i, ne00,
            src1_ddf1_i, ne10, dpct::get_value(&beta, *stream),
            dst_dd_i, ldc)));
+#else
+        auto dnnl_stream = ctx.stream_dnnl(stream);
+        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
+            src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
+#endif
    }
    (void) dst;
    (void) src1_ddq_i;
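
Note: these hunks introduce a compile-time switch: without GGML_SYCL_DNNL the GEMM goes through oneMKL as before; with it, the same product is routed through oneDNN via DnnlGemmWrapper::row_gemm from the newly included ggml-sycl/gemm.hpp. As I read the trans/nontrans flags, both branches compute dst = src0 · src1ᵀ in row-major terms. The scalar sketch below is my reference for that product, with illustrative names; it is not the backend's code.

#include <cstdio>

// Scalar reference for the product both GEMM branches appear to compute:
// dst(i, j) = sum_k src0(i, k) * src1(j, k), with i < row_diff,
// j < src1_ncols, k < ne10. src0 rows use leading dimension ne00;
// dst is stored column-major with leading dimension ldc.
static void gemm_ref(const float * src0, const float * src1, float * dst,
                     int row_diff, int src1_ncols, int ne10, int ne00, int ldc) {
    for (int j = 0; j < src1_ncols; ++j) {
        for (int i = 0; i < row_diff; ++i) {
            float acc = 0.0f;
            for (int k = 0; k < ne10; ++k) {
                acc += src0[i * ne00 + k] * src1[j * ne10 + k];
            }
            dst[j * ldc + i] = acc; // column-major result, ldc >= row_diff
        }
    }
}

int main() {
    // 2x3 weights times 2x3 activations -> 2x2 result
    const float src0[] = {1, 2, 3,
                          4, 5, 6};
    const float src1[] = {1, 0, 1,
                          0, 1, 0};
    float dst[4];
    gemm_ref(src0, src1, dst, /*row_diff=*/2, /*src1_ncols=*/2,
             /*ne10=*/3, /*ne00=*/3, /*ldc=*/2);
    printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // 4 10 2 5
}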
@@ -2636,47 +2592,6 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens
     (void) src1_dd;
 }

-inline void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW = src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW = src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW = dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-
-    if (dst->type == GGML_TYPE_F16) {
-        im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    } else {
-        im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-    }
-
-    (void) src0;
-    (void) src0_dd;
-}
-
 inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_dd, const float *src1_dd,
@@ -3981,6 +3896,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
     ggml_sycl_func_t func;

     switch (tensor->op) {
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            func = ggml_sycl_op_conv_transpose_1d;
+            break;
         case GGML_OP_REPEAT:
             func = ggml_sycl_repeat;
             break;
@@ -4105,6 +4023,9 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
         case GGML_OP_ARGSORT:
             func = ggml_sycl_argsort;
             break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            func = ggml_sycl_op_timestep_embedding;
+            break;
         default:
             return false;
     }
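
Note: the two dispatch hunks above register ops backed by the newly added ggml-sycl/conv.cpp and ggml-sycl/tsembd.cpp (entries 131 and 145 in the file list). As a reminder of what the first op computes, here is a textbook transposed 1-D convolution in scalar C++; ggml's exact parameterization (kernel layout, stride and dilation handling) may differ, so treat this as a sketch only.

#include <cstdio>

// Textbook transposed 1-D convolution: every input sample x[i] scatters
// x[i] * w[k] into y[i * stride + k]. Output length = (n - 1) * stride + kw.
static void conv_transpose_1d_ref(const float * x, int n,
                                  const float * w, int kw,
                                  int stride, float * y) {
    const int on = (n - 1) * stride + kw;
    for (int o = 0; o < on; ++o) y[o] = 0.0f;
    for (int i = 0; i < n; ++i) {
        for (int k = 0; k < kw; ++k) {
            y[i * stride + k] += x[i] * w[k];
        }
    }
}

int main() {
    const float x[] = {1, 2, 3};
    const float w[] = {1, 1};
    float y[4]; // (3 - 1) * 1 + 2 = 4 outputs at stride 1
    conv_transpose_1d_ref(x, 3, w, 2, 1, y);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]); // 1 3 5 3
}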
@@ -4117,7 +4038,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
     return true;
 }

-GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
     for(int i=0;i<max_len;i++) id_list[i] = -1;

@@ -4147,7 +4068,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
+GGML_API void ggml_sycl_get_device_description(int device, char *description,
                                                          size_t description_size) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
     dpct::device_info prop;
@@ -4161,7 +4082,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
+void ggml_backend_sycl_get_device_memory(int device, size_t *free,
                                                    size_t *total) try {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
     ggml_sycl_set_device(device);
@@ -4214,12 +4135,12 @@ struct ggml_backend_sycl_buffer_context {
     }
 };

-GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }

-GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
+static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
 }

@@ -4241,7 +4162,7 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
     return ctx->dev_ptr;
 }

-GGML_CALL static void
+static void
 ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                      ggml_tensor *tensor) try {
     ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
@@ -4316,7 +4237,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static bool
+static bool
 ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
                                     const ggml_tensor *src,
                                     ggml_tensor *dst) try {
@@ -4401,6 +4322,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
     /* .cpy_tensor      = */ ggml_backend_sycl_buffer_cpy_tensor,
@@ -4417,12 +4339,12 @@ struct ggml_backend_sycl_buffer_type_context {
     queue_ptr stream = nullptr;
 };

-GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
     ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;

     return ctx->name.c_str();
 }
-GGML_CALL static ggml_backend_buffer_t
+static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) try {
     ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
@@ -4433,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
     ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }
@@ -4442,7 +4368,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
     UNUSED(buft);
 }
@@ -4453,7 +4379,7 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ
     UNUSED(buft);
 }

-GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];

@@ -4498,6 +4424,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
             queue_ptr stream = &(device_i.default_queue());
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device  = */ nullptr,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
             };
         }
@@ -4523,6 +4450,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
         for (int i = 0; i < ggml_sycl_info().device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface   = */ ggml_backend_sycl_buffer_type_interface,
+                /* .device  = */ nullptr,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
             };
         }
@@ -4587,7 +4515,7 @@ struct ggml_backend_sycl_split_buffer_context {
     std::vector<queue_ptr> streams;
 };

-GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
     return GGML_SYCL_NAME "_Split";

     UNUSED(buffer);
@@ -4597,19 +4525,19 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
 }

-GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
     delete ctx;
 }

-GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
     // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
     return (void *)0x1000;

     UNUSED(buffer);
 }

-GGML_CALL static void
+static void
 ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                            ggml_tensor *tensor) try {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
@@ -4653,7 +4581,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         */
         SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                                        size, *stream)));
-
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
             /*
@@ -4688,7 +4620,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static void
+static void
 ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                           ggml_tensor *tensor, const void *data,
                                           size_t offset, size_t size) try {
@@ -4741,7 +4673,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static void
+static void
 ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                           const ggml_tensor *tensor, void *data,
                                           size_t offset, size_t size) try {
@@ -4794,7 +4726,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     UNUSED(buffer);
     UNUSED(value);
 }
@@ -4804,6 +4736,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_sycl_split_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_split_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_split_buffer_init_tensor,
+    /* .memset_tensor   = */ NULL,
     /* .set_tensor      = */ ggml_backend_sycl_split_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_sycl_split_buffer_get_tensor,
     /* .cpy_tensor      = */ NULL,
@@ -4811,13 +4744,13 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
     /* .reset           = */ NULL,
 };

-GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_SYCL_NAME "_Split";

     UNUSED(buft);
 }

-GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -4827,12 +4760,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
 }

-GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
     UNUSED(buft);
 }

-GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;

     size_t total_size = 0;
@@ -4859,7 +4792,7 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
     return total_size;
 }

-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;

     UNUSED(buft);
@@ -4874,7 +4807,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
     /* .is_host          = */ ggml_backend_sycl_split_buffer_type_is_host,
 };

-GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
+ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);

@@ -4906,6 +4839,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f

     struct ggml_backend_buffer_type buft {
         /* .iface   = */ ggml_backend_sycl_split_buffer_type_interface,
+        /* .device  = */ nullptr,
         /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
     };

@@ -4915,13 +4849,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f

 // host buffer type

-GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_SYCL_NAME "_Host";

     UNUSED(buft);
 }

-GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
     return GGML_SYCL_NAME "_Host";

     UNUSED(buffer);
@@ -4959,6 +4893,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host        = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
+        /* .device  = */ nullptr,
         /* .context = */ nullptr,
     };

@@ -4967,14 +4902,14 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {

 // backend

-GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
+static const char * ggml_backend_sycl_name(ggml_backend_t backend) {

     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

     return sycl_ctx->name.c_str();
 }

-GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
+static void ggml_backend_sycl_free(ggml_backend_t backend) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;

     delete sycl_ctx;
@@ -4982,12 +4917,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
 }


-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     return ggml_backend_sycl_buffer_type(sycl_ctx->device);
 }

-GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
+static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
                                                size_t size) try {
@@ -5005,7 +4940,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
+static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                                const ggml_tensor *tensor,
                                                void *data, size_t offset,
                                                size_t size) try {
@@ -5023,9 +4958,9 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
-                                                         const ggml_tensor *src,
-                                                         ggml_tensor *dst) try {
+static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *src,
+                                               ggml_tensor *dst) try {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
         /*
@@ -5060,7 +4995,7 @@ catch (sycl::exception const &exc) {
     std::exit(1);
 }

-GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
     ggml_sycl_set_main_device(sycl_ctx->device);

@@ -5088,8 +5023,17 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
     return GGML_STATUS_SUCCESS;
 }

-GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_CONV_TRANSPOSE_1D:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                return false;
+            } break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
@@ -5198,13 +5142,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
@@ -5213,6 +5161,7 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:
         case GGML_OP_LEAKY_RELU:
+        case GGML_OP_TIMESTEP_EMBEDDING:
             return true;
         default:
             return false;
@@ -5221,13 +5170,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
     UNUSED(backend);
 }

-GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
     return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
     GGML_UNUSED(backend);
 }

-GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
     if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
         return false;
     }
@@ -5252,11 +5201,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
     /* .supports_op             = */ ggml_backend_sycl_supports_op,
     /* .supports_buft           = */ ggml_backend_sycl_supports_buft,
     /* .offload_op              = */ ggml_backend_sycl_offload_op,
-    /* .event_new               = */ NULL,
-    /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
     /* .event_wait              = */ NULL,
-    /* .event_synchronize       = */ NULL,
 };

 static ggml_guid_t ggml_backend_sycl_guid() {
@@ -5264,7 +5210,7 @@ static ggml_backend_sycl_guid() {
     return &guid;
 }

-GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
+ggml_backend_t ggml_backend_sycl_init(int device) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
     ggml_check_sycl();

@@ -5279,6 +5225,7 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
     ggml_backend_t sycl_backend = new ggml_backend {
         /* .guid      = */ ggml_backend_sycl_guid(),
         /* .interface = */ ggml_backend_sycl_interface,
+        /* .device    = */ nullptr,
         /* .context   = */ ctx
     };

@@ -5289,26 +5236,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
 }

-GGML_CALL int ggml_backend_sycl_get_device_count() {
+int ggml_backend_sycl_get_device_count() {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
     return ggml_sycl_info().device_count;
 }
-
-GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
-    ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
-    return sycl_backend;
-
-    UNUSED(params);
-}
-
-extern "C" int ggml_backend_sycl_reg_devices();
-
-int ggml_backend_sycl_reg_devices() {
-    assert(ggml_sycl_info().device_count>0);
-    for (int i = 0; i < ggml_sycl_info().device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return ggml_sycl_info().device_count;
-}