@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
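
Of the files listed above, the largest SYCL-backend change is to ggml/src/ggml-sycl/element_wise.cpp, whose diff follows. Its float-only element-wise kernels (gelu_f32, silu_f32, relu_f32, ...) become type templates, and each ggml_sycl_op_* entry point now receives only the destination tensor and dispatches on its type, so one kernel serves FP32 and, when GGML_SYCL_F16 is defined, FP16. A condensed sketch of that pattern, assembled from the added lines in the diff below (abridged, not a compilable standalone file):

template<typename T>
static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> & item_ct1) {
    // grid-stride loop over k elements; T is float or sycl::half
    for (auto i = item_ct1.get_global_id(2); i < (const size_t) k; i += item_ct1.get_global_range(2)) {
        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : (x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f));
    }
}

inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    dpct::queue_ptr main_stream = ctx.stream();
    switch (dst->type) {  // dispatch on tensor type instead of assuming GGML_TYPE_F32
#if defined (GGML_SYCL_F16)
        case GGML_TYPE_F16: { auto data_pts = cast_data<sycl::half>(dst); sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); break; }
#endif
        case GGML_TYPE_F32: { auto data_pts = cast_data<float>(dst); sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); break; }
        default: GGML_ABORT("GGML tensor type not supported!\n");
    }
}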
@@ -1,4 +1,5 @@
  #include "common.hpp"
+ #include "ggml.h"
  #include "element_wise.hpp"
 
  static void acc_f32(const float * x, const float * y, float * dst, const int ne,
@@ -20,10 +21,32 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
  }
  }
 
- static void gelu_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+ for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+ dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
+ }
+ }
+
+ template<typename T>
+ static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+ for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+ dst[i] = sycl::fabs(x[i]);
+ }
+ }
+
+ template<typename T>
+ static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
+ for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
+ dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
+ }
+ }
+
+ template<typename T>
+ static void gelu(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
- const float GELU_COEF_A = 0.044715f;
- const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+ const T GELU_COEF_A = static_cast<T>(0.044715f);
+ const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
 
@@ -32,12 +55,13 @@ static void gelu_f32(const float * x, float * dst, const int k,
  }
 
  float xi = x[i];
- dst[i] = 0.5f * xi *
- (1.0f +
- sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+ dst[i] = static_cast<T>(0.5f) * xi *
+ (static_cast<T>(1.0f) +
+ sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast<T>(1.0f) + GELU_COEF_A * xi * xi)));
  }
 
- static void silu_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void silu(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -45,10 +69,11 @@ static void silu_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+ dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
  }
 
- static void gelu_quick_f32(const float *x, float *dst, int k,
+ template<typename T>
+ static void gelu_quick(const T *x, T *dst, int k,
  const sycl::nd_item<3> &item_ct1) {
  const float GELU_QUICK_COEF = -1.702f;
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
@@ -56,20 +81,22 @@ static void gelu_quick_f32(const float *x, float *dst, int k,
  if (i >= k) {
  return;
  }
- dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+ dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
  }
 
- static void tanh_f32(const float *x, float *dst, int k,
+ template<typename T>
+ static void tanh(const T *x, T *dst, int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
  if (i >= k) {
  return;
  }
- dst[i] = sycl::tanh((float)(x[i]));
+ dst[i] = sycl::tanh((x[i]));
  }
 
- static void relu_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void relu(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -77,10 +104,11 @@ static void relu_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = sycl::fmax((float)(x[i]), (float)0);
+ dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
  }
 
- static void sigmoid_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void sigmoid(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -88,10 +116,11 @@ static void sigmoid_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
+ dst[i] = 1.0f / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
  }
 
- static void sqrt_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void sqrt(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -102,7 +131,8 @@ static void sqrt_f32(const float * x, float * dst, const int k,
  dst[i] = sycl::sqrt(x[i]);
  }
 
- static void sin_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void sin(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -113,7 +143,8 @@ static void sin_f32(const float * x, float * dst, const int k,
  dst[i] = sycl::sin(x[i]);
  }
 
- static void cos_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void cos(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -124,7 +155,8 @@ static void cos_f32(const float * x, float * dst, const int k,
  dst[i] = sycl::cos(x[i]);
  }
 
- static void hardsigmoid_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void hardsigmoid(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -132,10 +164,11 @@ static void hardsigmoid_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+ dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
  }
 
- static void hardswish_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void hardswish(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -143,10 +176,11 @@ static void hardswish_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+ dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
  }
 
- static void exp_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void exp(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -157,7 +191,8 @@ static void exp_f32(const float * x, float * dst, const int k,
  dst[i] = sycl::exp(x[i]);
  }
 
- static void log_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void log(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -165,15 +200,16 @@ static void log_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- float xi = x[i];
+ T xi = x[i];
  if (xi <= 0) {
- dst[i] = -INFINITY;
+ dst[i] = neg_infinity<T>();
  } else {
  dst[i] = sycl::log(xi);
  }
  }
 
- static void neg_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void neg(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -184,7 +220,8 @@ static void neg_f32(const float * x, float * dst, const int k,
  dst[i] = -x[i];
  }
 
- static void step_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void step(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -192,21 +229,23 @@ static void step_f32(const float * x, float * dst, const int k,
  if (i >= k) {
  return;
  }
- dst[i] = x[i] > 0.0f;
+ dst[i] = x[i] > static_cast<T>(0.0f);
  }
 
- static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+ template<typename T>
+ static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
  if (i >= k) {
  return;
  }
- dst[i] = sycl::fmax((float)(x[i]), (float)0) +
- sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+ dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
+ sycl::fmin((x[i]), static_cast<T>(0.0f)) * negative_slope;
  }
 
- static void sqr_f32(const float * x, float * dst, const int k,
+ template<typename T>
+ static void sqr(const T * x, T * dst, const int k,
  const sycl::nd_item<3> &item_ct1) {
  const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
  item_ct1.get_local_id(2);
@@ -217,7 +256,8 @@ static void sqr_f32(const float * x, float * dst, const int k,
  dst[i] = x[i] * x[i];
  }
 
- static void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+ template<typename T>
+ static void upscale(const T *x, T *dst, const int nb00, const int nb01,
  const int nb02, const int nb03, const int ne10, const int ne11,
  const int ne12, const int ne13, const float sf0, const float sf1,
  const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
@@ -237,10 +277,11 @@ static void upscale_f32(const float *x, float *dst, const int nb00, const int n
  int i02 = i12 / sf2;
  int i03 = i13 / sf3;
 
- dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+ dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
  }
 
- static void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+ template <typename T>
+ static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
  const sycl::nd_item<3> &item_ct1) {
  int nidx = item_ct1.get_local_id(2) +
  item_ct1.get_group(2) * item_ct1.get_local_range(2);
@@ -256,11 +297,23 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
  item_ct1.get_group(0) * ne00 * ne01;
  dst[offset_dst] = x[offset_src];
  } else {
- dst[offset_dst] = 0.0f;
+ dst[offset_dst] = static_cast<T>(0.0f);
  }
  }
 
 
+ template<typename T>
+ static void clamp(const T * x, T * dst, const float min, const float max, const int k,
+ const sycl::nd_item<3> &item_ct1) {
+ const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+ item_ct1.get_local_id(2);
+
+ if (i >= k) {
+ return;
+ }
+
+ dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
+ }
 
  static void acc_f32_sycl(const float *x, const float *y, float *dst,
  const int n_elements, const int ne10, const int ne11,
@@ -277,7 +330,8 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
  });
  }
 
- static void gelu_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void gelu_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
  stream->parallel_for(
@@ -285,11 +339,12 @@ static void gelu_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- gelu_f32(x, dst, k, item_ct1);
+ gelu(x, dst, k, item_ct1);
  });
  }
 
- static void silu_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void silu_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
  stream->parallel_for(
@@ -297,11 +352,43 @@ static void silu_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- silu_f32(x, dst, k, item_ct1);
+ silu(x, dst, k, item_ct1);
  });
  }
 
- static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+ // hard code for now
+ const int num_blocks = ceil_div(k, 256);
+ stream->parallel_for(
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+ sgn(x, dst, k, item_ct1);
+ });
+ }
+
+ template<typename T>
+ static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+ // hard code for now
+ const int num_blocks = ceil_div(k, 256);
+ stream->parallel_for(
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+ abs_op(x, dst, k, item_ct1);
+ });
+ }
+
+
+ template<typename T>
+ static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
+ // hard code for now
+ const int num_blocks = ceil_div(k, 256);
+ stream->parallel_for(
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
+ elu_op(x, dst, k, item_ct1);
+ });
+ }
+
+ template<typename T>
+ static void gelu_quick_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
  stream->parallel_for(
@@ -309,11 +396,12 @@ static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- gelu_quick_f32(x, dst, k, item_ct1);
+ gelu_quick(x, dst, k, item_ct1);
  });
  }
 
- static void tanh_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void tanh_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
  stream->parallel_for(
@@ -321,11 +409,12 @@ static void tanh_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- tanh_f32(x, dst, k, item_ct1);
+ tanh(x, dst, k, item_ct1);
  });
  }
 
- static void relu_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void relu_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
  stream->parallel_for(
@@ -333,11 +422,12 @@ static void relu_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- relu_f32(x, dst, k, item_ct1);
+ relu(x, dst, k, item_ct1);
  });
  }
 
- static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void hardsigmoid_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
  stream->parallel_for(
@@ -345,11 +435,12 @@ static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- hardsigmoid_f32(x, dst, k, item_ct1);
+ hardsigmoid(x, dst, k, item_ct1);
  });
  }
 
- static void hardswish_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void hardswish_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
  stream->parallel_for(
@@ -357,11 +448,12 @@ static void hardswish_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- hardswish_f32(x, dst, k, item_ct1);
+ hardswish(x, dst, k, item_ct1);
  });
  }
 
- static void exp_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void exp_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
  stream->parallel_for(
@@ -369,11 +461,12 @@ static void exp_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- exp_f32(x, dst, k, item_ct1);
+ exp(x, dst, k, item_ct1);
  });
  }
 
- static void log_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void log_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
  stream->parallel_for(
@@ -381,11 +474,12 @@ static void log_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- log_f32(x, dst, k, item_ct1);
+ log(x, dst, k, item_ct1);
  });
  }
 
- static void neg_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void neg_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
  stream->parallel_for(
@@ -393,11 +487,12 @@ static void neg_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- neg_f32(x, dst, k, item_ct1);
+ neg(x, dst, k, item_ct1);
  });
  }
 
- static void step_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void step_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
  stream->parallel_for(
@@ -405,11 +500,12 @@ static void step_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- step_f32(x, dst, k, item_ct1);
+ step(x, dst, k, item_ct1);
  });
  }
 
- static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void sigmoid_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
  stream->parallel_for(
@@ -417,11 +513,12 @@ static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- sigmoid_f32(x, dst, k, item_ct1);
+ sigmoid(x, dst, k, item_ct1);
  });
  }
 
- static void sqrt_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void sqrt_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
  stream->parallel_for(
@@ -429,11 +526,12 @@ static void sqrt_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- sqrt_f32(x, dst, k, item_ct1);
+ sqrt(x, dst, k, item_ct1);
  });
  }
 
- static void sin_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void sin_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
  stream->parallel_for(
@@ -441,11 +539,12 @@ static void sin_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- sin_f32(x, dst, k, item_ct1);
+ sin(x, dst, k, item_ct1);
  });
  }
 
- static void cos_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void cos_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
  stream->parallel_for(
@@ -453,11 +552,12 @@ static void cos_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- cos_f32(x, dst, k, item_ct1);
+ cos(x, dst, k, item_ct1);
  });
  }
 
- static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void leaky_relu_sycl(const T *x, T *dst, const int k,
  const float negative_slope,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
@@ -466,11 +566,12 @@ static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+ leaky_relu(x, dst, k, negative_slope, item_ct1);
  });
  }
 
- static void sqr_f32_sycl(const float *x, float *dst, const int k,
+ template<typename T>
+ static void sqr_sycl(const T *x, T *dst, const int k,
  queue_ptr stream) {
  const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
  stream->parallel_for(
@@ -478,11 +579,12 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
  sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
  sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
  [=](sycl::nd_item<3> item_ct1) {
- sqr_f32(x, dst, k, item_ct1);
+ sqr(x, dst, k, item_ct1);
  });
  }
 
- static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+ template<typename T>
+ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
  const int nb02, const int nb03, const int ne10, const int ne11,
  const int ne12, const int ne13, const float sf0, const float sf1,
  const float sf2, const float sf3, queue_ptr stream) {
@@ -492,11 +594,12 @@ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const i
  stream->parallel_for(
  sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
  [=](sycl::nd_item<1> item_ct1) {
- upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
+ upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
  });
  }
 
- static void pad_f32_sycl(const float *x, float *dst, const int ne00,
+ template<typename T>
+ static void pad_sycl(const T *x, T *dst, const int ne00,
  const int ne01, const int ne02, const int ne0,
  const int ne1, const int ne2, queue_ptr stream) {
  int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
@@ -505,526 +608,952 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
505
608
  sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
506
609
  sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
507
610
  [=](sycl::nd_item<3> item_ct1) {
508
- pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
611
+ pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
509
612
  });
510
613
  }
511
614
 
512
- inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
513
- ggml_tensor *dst, const float *src0_dd,
514
- const float *src1_dd, float *dst_dd,
515
- const queue_ptr &main_stream) {
516
-
517
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
518
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
519
-
520
- silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
521
-
522
- GGML_UNUSED(src1);
523
- GGML_UNUSED(dst);
524
- GGML_UNUSED(src1_dd);
525
- GGML_UNUSED(ctx);
615
+ template<typename T>
616
+ static void clamp_sycl(const T *x, T *dst, const float min,
617
+ const float max, const int k,
618
+ queue_ptr stream) {
619
+ const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
620
+ stream->parallel_for(
621
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
622
+ sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
623
+ sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
624
+ [=](sycl::nd_item<3> item_ct1) {
625
+ clamp(x, dst, min, max, k, item_ct1);
626
+ });
526
627
  }
527
628
 
528
- inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
529
- ggml_tensor *dst, const float *src0_dd,
530
- const float *src1_dd, float *dst_dd,
531
- const queue_ptr &main_stream) {
532
-
533
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
534
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
629
+ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
630
+ #if defined (GGML_SYCL_F16)
631
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
632
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
535
633
 
536
- gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
537
-
538
- GGML_UNUSED(src1);
539
- GGML_UNUSED(dst);
540
- GGML_UNUSED(src1_dd);
541
- GGML_UNUSED(ctx);
634
+ #else
635
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
636
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
637
+ #endif
638
+ GGML_ASSERT(dst->src[0]->type == dst->type);
639
+ dpct::queue_ptr main_stream = ctx.stream();
640
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
641
+ switch (dst->type) {
642
+ #if defined (GGML_SYCL_F16)
643
+ case GGML_TYPE_F16:
644
+ {
645
+ auto data_pts = cast_data<sycl::half>(dst);
646
+ sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
647
+ break;
648
+ }
649
+ #endif
650
+ case GGML_TYPE_F32:
651
+ {
652
+ auto data_pts = cast_data<float>(dst);
653
+ sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
654
+ break;
655
+ }
656
+ default:
657
+ GGML_ABORT("GGML tensor type not supported!\n");
658
+ break;
659
+ }
542
660
  }
543
- inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
544
- const ggml_tensor *src1, ggml_tensor *dst,
545
- const float *src0_dd, const float *src1_dd,
546
- float *dst_dd,
547
- const queue_ptr &main_stream) {
548
-
549
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
550
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
551
661
 
552
- gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
662
+ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
663
+ #if defined (GGML_SYCL_F16)
664
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
665
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
553
666
 
554
- GGML_UNUSED(src1);
555
- GGML_UNUSED(dst);
556
- GGML_UNUSED(src1_dd);
557
- GGML_UNUSED(ctx);
667
+ #else
668
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
669
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
670
+ #endif
671
+ GGML_ASSERT(dst->src[0]->type == dst->type);
672
+ dpct::queue_ptr main_stream = ctx.stream();
673
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
674
+ switch (dst->type) {
675
+ #if defined (GGML_SYCL_F16)
676
+ case GGML_TYPE_F16:
677
+ {
678
+ auto data_pts = cast_data<sycl::half>(dst);
679
+ abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
680
+ break;
681
+ }
682
+ #endif
683
+ case GGML_TYPE_F32:
684
+ {
685
+ auto data_pts = cast_data<float>(dst);
686
+ abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
687
+ break;
688
+ }
689
+ default:
690
+ GGML_ABORT("GGML tensor type not supported!\n");
691
+ break;
692
+ }
558
693
  }
559
694
 
560
- inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
561
- ggml_tensor *dst, const float *src0_dd,
562
- const float *src1_dd, float *dst_dd,
563
- const queue_ptr &main_stream) {
564
695
 
565
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
566
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
567
- tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
696
+ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
697
+ #if defined (GGML_SYCL_F16)
698
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
699
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
568
700
 
569
- GGML_UNUSED(src1);
570
- GGML_UNUSED(dst);
571
- GGML_UNUSED(src1_dd);
572
- GGML_UNUSED(ctx);
701
+ #else
702
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
703
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
704
+ #endif
705
+ GGML_ASSERT(dst->src[0]->type == dst->type);
706
+ dpct::queue_ptr main_stream = ctx.stream();
707
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
708
+ switch (dst->type) {
709
+ #if defined (GGML_SYCL_F16)
710
+ case GGML_TYPE_F16:
711
+ {
712
+ auto data_pts = cast_data<sycl::half>(dst);
713
+ elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
714
+ break;
715
+ }
716
+ #endif
717
+ case GGML_TYPE_F32:
718
+ {
719
+ auto data_pts = cast_data<float>(dst);
720
+ elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
721
+ break;
722
+ }
723
+ default:
724
+ GGML_ABORT("GGML tensor type not supported!\n");
725
+ break;
726
+ }
573
727
  }
574
728
 
575
- inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
576
- ggml_tensor *dst, const float *src0_dd,
577
- const float *src1_dd, float *dst_dd,
578
- const queue_ptr &main_stream) {
579
-
580
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
581
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
582
-
583
- relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
584
-
585
- GGML_UNUSED(src1);
586
- GGML_UNUSED(dst);
587
- GGML_UNUSED(src1_dd);
588
- GGML_UNUSED(ctx);
729
+ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
730
+ #if defined (GGML_SYCL_F16)
731
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
732
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
733
+ #else
734
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
735
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
736
+ #endif
737
+ GGML_ASSERT(dst->src[0]->type == dst->type);
738
+ dpct::queue_ptr main_stream = ctx.stream();
739
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
740
+ switch (dst->type) {
741
+ #if defined (GGML_SYCL_F16)
742
+ case GGML_TYPE_F16:
743
+ {
744
+ auto data_pts = cast_data<sycl::half>(dst);
745
+ silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
746
+ break;
747
+ }
748
+ #endif
749
+ case GGML_TYPE_F32:
750
+ {
751
+ auto data_pts = cast_data<float>(dst);
752
+ silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
753
+ break;
754
+ }
755
+ default:
756
+ GGML_ABORT("GGML tensor type not supported!\n");
757
+ break;
758
+ }
589
759
  }
590
760
 
591
- inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
592
- const ggml_tensor *src1, ggml_tensor *dst,
593
- const float *src0_dd, const float *src1_dd,
594
- float *dst_dd,
595
- const queue_ptr &main_stream) {
596
-
597
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
598
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
599
-
600
- hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
601
-
602
- GGML_UNUSED(src1);
603
- GGML_UNUSED(dst);
604
- GGML_UNUSED(src1_dd);
605
- GGML_UNUSED(ctx);
761
+ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
762
+ #if defined (GGML_SYCL_F16)
763
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
764
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
765
+ #else
766
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
767
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
768
+ #endif
769
+ GGML_ASSERT(dst->src[0]->type == dst->type);
770
+ dpct::queue_ptr main_stream = ctx.stream();
771
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
772
+ switch (dst->type) {
773
+ #if defined (GGML_SYCL_F16)
774
+ case GGML_TYPE_F16:
775
+ {
776
+ auto data_pts = cast_data<sycl::half>(dst);
777
+ gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
778
+ break;
779
+ }
780
+ #endif
781
+ case GGML_TYPE_F32:
782
+ {
783
+ auto data_pts = cast_data<float>(dst);
784
+ gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
785
+ break;
786
+ }
787
+ default:
788
+ GGML_ABORT("GGML tensor type not supported!\n");
789
+ break;
790
+ }
606
791
  }
607
792
 
608
- inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
609
- const ggml_tensor *src1, ggml_tensor *dst,
610
- const float *src0_dd, const float *src1_dd,
611
- float *dst_dd, const queue_ptr &main_stream) {
612
-
613
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
614
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
615
-
616
- hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
617
-
618
- GGML_UNUSED(src1);
619
- GGML_UNUSED(dst);
620
- GGML_UNUSED(src1_dd);
621
- GGML_UNUSED(ctx);
793
+ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
794
+ #if defined (GGML_SYCL_F16)
795
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
796
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
797
+ #else
798
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
799
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
800
+ #endif
801
+ GGML_ASSERT(dst->src[0]->type == dst->type);
802
+ dpct::queue_ptr main_stream = ctx.stream();
803
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
804
+ switch (dst->type) {
805
+ #if defined (GGML_SYCL_F16)
806
+ case GGML_TYPE_F16:
807
+ {
808
+ auto data_pts = cast_data<sycl::half>(dst);
809
+ gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
810
+ break;
811
+ }
812
+ #endif
813
+ case GGML_TYPE_F32:
814
+ {
815
+ auto data_pts = cast_data<float>(dst);
816
+ gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
817
+ break;
818
+ }
819
+ default:
820
+ GGML_ABORT("GGML tensor type not supported!\n");
821
+ break;
822
+ }
622
823
  }
623
824
 
624
- inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
625
- const ggml_tensor *src1, ggml_tensor *dst,
626
- const float *src0_dd, const float *src1_dd,
627
- float *dst_dd, const queue_ptr &main_stream) {
628
-
629
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
630
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
631
-
632
- exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
633
-
634
- GGML_UNUSED(src1);
635
- GGML_UNUSED(dst);
636
- GGML_UNUSED(src1_dd);
637
- GGML_UNUSED(ctx);
825
+ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
826
+ #if defined (GGML_SYCL_F16)
827
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
828
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
829
+ #else
830
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
831
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
832
+ #endif
833
+ GGML_ASSERT(dst->src[0]->type == dst->type);
834
+ dpct::queue_ptr main_stream = ctx.stream();
835
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
836
+ switch (dst->type) {
837
+ #if defined (GGML_SYCL_F16)
838
+ case GGML_TYPE_F16:
839
+ {
840
+ auto data_pts = cast_data<sycl::half>(dst);
841
+ tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
842
+ break;
843
+ }
844
+ #endif
845
+ case GGML_TYPE_F32:
846
+ {
847
+ auto data_pts = cast_data<float>(dst);
848
+ tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
849
+ break;
850
+ }
851
+ default:
852
+ GGML_ABORT("GGML tensor type not supported!\n");
853
+ break;
854
+ }
638
855
  }
639
856
 
640
- inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
641
- const ggml_tensor *src1, ggml_tensor *dst,
642
- const float *src0_dd, const float *src1_dd,
643
- float *dst_dd, const queue_ptr &main_stream) {
644
-
645
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
646
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
647
-
648
- log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
649
-
650
- GGML_UNUSED(src1);
651
- GGML_UNUSED(dst);
652
- GGML_UNUSED(src1_dd);
653
- GGML_UNUSED(ctx);
857
+ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
858
+ #if defined (GGML_SYCL_F16)
859
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
860
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
861
+ #else
862
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
863
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
864
+ #endif
865
+ GGML_ASSERT(dst->src[0]->type == dst->type);
866
+ dpct::queue_ptr main_stream = ctx.stream();
867
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
868
+
869
+ switch (dst->type) {
870
+ #if defined (GGML_SYCL_F16)
871
+ case GGML_TYPE_F16:
872
+ {
873
+ auto data_pts = cast_data<sycl::half>(dst);
874
+ relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
875
+ break;
876
+ }
877
+ #endif
878
+ case GGML_TYPE_F32:
879
+ {
880
+ auto data_pts = cast_data<float>(dst);
881
+ relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
882
+ break;
883
+ }
884
+ default:
885
+ GGML_ABORT("GGML tensor type not supported!\n");
886
+ break;
887
+ }
654
888
  }
655
889
 
656
- inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
657
- const ggml_tensor *src1, ggml_tensor *dst,
658
- const float *src0_dd, const float *src1_dd,
659
- float *dst_dd, const queue_ptr &main_stream) {
660
-
661
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
662
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
663
-
664
- sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
665
-
666
- GGML_UNUSED(src1);
667
- GGML_UNUSED(dst);
668
- GGML_UNUSED(src1_dd);
669
- GGML_UNUSED(ctx);
890
+ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
891
+ #if defined (GGML_SYCL_F16)
892
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
893
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
894
+ #else
895
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
896
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
897
+ #endif
898
+ GGML_ASSERT(dst->src[0]->type == dst->type);
899
+
900
+ dpct::queue_ptr main_stream = ctx.stream();
901
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
902
+
903
+ switch (dst->type) {
904
+ #if defined (GGML_SYCL_F16)
905
+ case GGML_TYPE_F16:
906
+ {
907
+ auto data_pts = cast_data<sycl::half>(dst);
908
+ hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
909
+ break;
910
+ }
911
+ #endif
912
+ case GGML_TYPE_F32:
913
+ {
914
+ auto data_pts = cast_data<float>(dst);
915
+ hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
916
+ break;
917
+ }
918
+ default:
919
+ GGML_ABORT("GGML tensor type not supported!\n");
920
+ break;
921
+ }
670
922
  }
671
923
 
672
- inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
673
- const ggml_tensor *src1, ggml_tensor *dst,
674
- const float *src0_dd, const float *src1_dd,
675
- float *dst_dd, const queue_ptr &main_stream) {
676
-
677
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
678
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
679
-
680
- sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
681
-
682
- GGML_UNUSED(src1);
683
- GGML_UNUSED(dst);
684
- GGML_UNUSED(src1_dd);
685
- GGML_UNUSED(ctx);
924
+ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
925
+ #if defined (GGML_SYCL_F16)
926
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
927
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
928
+ #else
929
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
930
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
931
+ #endif
932
+ GGML_ASSERT(dst->src[0]->type == dst->type);
933
+ dpct::queue_ptr main_stream = ctx.stream();
934
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
935
+ switch (dst->type) {
936
+ #if defined (GGML_SYCL_F16)
937
+ case GGML_TYPE_F16:
938
+ {
939
+ auto data_pts = cast_data<sycl::half>(dst);
940
+ hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
941
+ break;
942
+ }
943
+ #endif
944
+ case GGML_TYPE_F32:
945
+ {
946
+ auto data_pts = cast_data<float>(dst);
947
+ hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
948
+ break;
949
+ }
950
+ default:
951
+ GGML_ABORT("GGML tensor type not supported!\n");
952
+ break;
953
+ }
686
954
  }
687
955
 
688
- inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
689
- const ggml_tensor *src1, ggml_tensor *dst,
690
- const float *src0_dd, const float *src1_dd,
691
- float *dst_dd, const queue_ptr &main_stream) {
692
-
693
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
694
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
695
-
696
- sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
697
-
698
- GGML_UNUSED(src1);
699
- GGML_UNUSED(dst);
700
- GGML_UNUSED(src1_dd);
701
- GGML_UNUSED(ctx);
956
+ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
957
+ #if defined (GGML_SYCL_F16)
958
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
959
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
960
+ #else
961
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
962
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
963
+ #endif
964
+ GGML_ASSERT(dst->src[0]->type == dst->type);
965
+ dpct::queue_ptr main_stream = ctx.stream();
966
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
967
+ switch (dst->type) {
968
+ #if defined (GGML_SYCL_F16)
969
+ case GGML_TYPE_F16:
970
+ {
971
+ auto data_pts = cast_data<sycl::half>(dst);
972
+ exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
973
+ break;
974
+ }
975
+ #endif
976
+ case GGML_TYPE_F32:
977
+ {
978
+ auto data_pts = cast_data<float>(dst);
979
+ exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
980
+ break;
981
+ }
982
+ default:
983
+ GGML_ABORT("GGML tensor type not supported!\n");
984
+ break;
985
+ }
702
986
  }
703
987
 
704
- inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
705
- const ggml_tensor *src1, ggml_tensor *dst,
706
- const float *src0_dd, const float *src1_dd,
707
- float *dst_dd, const queue_ptr &main_stream) {
708
-
709
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
710
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
711
-
712
- cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
713
-
714
- GGML_UNUSED(src1);
715
- GGML_UNUSED(dst);
716
- GGML_UNUSED(src1_dd);
717
- GGML_UNUSED(ctx);
988
+ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
989
+ #if defined (GGML_SYCL_F16)
990
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
991
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
992
+ #else
993
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
994
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
995
+ #endif
996
+ GGML_ASSERT(dst->src[0]->type == dst->type);
997
+ dpct::queue_ptr main_stream = ctx.stream();
998
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
999
+ switch (dst->type) {
1000
+ #if defined (GGML_SYCL_F16)
1001
+ case GGML_TYPE_F16:
1002
+ {
1003
+ auto data_pts = cast_data<sycl::half>(dst);
1004
+ log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1005
+ break;
1006
+ }
1007
+ #endif
1008
+ case GGML_TYPE_F32:
1009
+ {
1010
+ auto data_pts = cast_data<float>(dst);
1011
+ log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1012
+ break;
1013
+ }
1014
+ default:
1015
+ GGML_ABORT("GGML tensor type not supported!\n");
1016
+ break;
1017
+ }
718
1018
  }
719
1019
 
720
- inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
721
- const ggml_tensor *src1, ggml_tensor *dst,
722
- const float *src0_dd, const float *src1_dd,
723
- float *dst_dd, const queue_ptr &main_stream) {
724
-
725
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
726
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
727
-
728
- step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
729
-
730
- GGML_UNUSED(src1);
731
- GGML_UNUSED(dst);
732
- GGML_UNUSED(src1_dd);
733
- GGML_UNUSED(ctx);
1020
+ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1021
+ #if defined (GGML_SYCL_F16)
1022
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1023
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1024
+ #else
1025
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1026
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1027
+ #endif
1028
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1029
+ dpct::queue_ptr main_stream = ctx.stream();
1030
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1031
+ switch (dst->type) {
1032
+ #if defined (GGML_SYCL_F16)
1033
+ case GGML_TYPE_F16:
1034
+ {
1035
+ auto data_pts = cast_data<sycl::half>(dst);
1036
+ sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1037
+ break;
1038
+ }
1039
+ #endif
1040
+ case GGML_TYPE_F32:
1041
+ {
1042
+ auto data_pts = cast_data<float>(dst);
1043
+ sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1044
+ break;
1045
+ }
1046
+ default:
1047
+ GGML_ABORT("GGML tensor type not supported!\n");
1048
+ break;
1049
+ }
734
1050
  }
735
1051
 
736
- inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
737
- const ggml_tensor *src1, ggml_tensor *dst,
738
- const float *src0_dd, const float *src1_dd,
739
- float *dst_dd, const queue_ptr &main_stream) {
1052
+ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1053
+ #if defined (GGML_SYCL_F16)
1054
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1055
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1056
+ #else
1057
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1058
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1059
+ #endif
1060
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1061
+
1062
+ dpct::queue_ptr main_stream = ctx.stream();
1063
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1064
+ switch (dst->type) {
1065
+ #if defined (GGML_SYCL_F16)
1066
+ case GGML_TYPE_F16:
1067
+ {
1068
+ auto data_pts = cast_data<sycl::half>(dst);
1069
+ sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1070
+ break;
1071
+ }
1072
+ #endif
1073
+ case GGML_TYPE_F32:
1074
+ {
1075
+ auto data_pts = cast_data<float>(dst);
1076
+ sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1077
+ break;
1078
+ }
1079
+ default:
1080
+ GGML_ABORT("GGML tensor type not supported!\n");
1081
+ break;
1082
+ }
1083
+ }
740
1084
 
741
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
742
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
1085
+ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1086
+ #if defined (GGML_SYCL_F16)
1087
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1088
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1089
+ #else
1090
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1091
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1092
+ #endif
1093
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1094
+ dpct::queue_ptr main_stream = ctx.stream();
1095
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1096
+ switch (dst->type) {
1097
+ #if defined (GGML_SYCL_F16)
1098
+ case GGML_TYPE_F16:
1099
+ {
1100
+ auto data_pts = cast_data<sycl::half>(dst);
1101
+ sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1102
+ break;
1103
+ }
1104
+ #endif
1105
+ case GGML_TYPE_F32:
1106
+ {
1107
+ auto data_pts = cast_data<float>(dst);
1108
+ sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1109
+ break;
1110
+ }
1111
+ default:
1112
+ GGML_ABORT("GGML tensor type not supported!\n");
1113
+ break;
1114
+ }
1115
+ }
743
1116
 
744
- neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
1117
+ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1118
+ #if defined (GGML_SYCL_F16)
1119
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1120
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1121
+ #else
1122
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1123
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1124
+ #endif
1125
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1126
+ dpct::queue_ptr main_stream = ctx.stream();
1127
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1128
+ switch (dst->type) {
1129
+ #if defined (GGML_SYCL_F16)
1130
+ case GGML_TYPE_F16:
1131
+ {
1132
+ auto data_pts = cast_data<sycl::half>(dst);
1133
+ cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1134
+ break;
1135
+ }
1136
+ #endif
1137
+ case GGML_TYPE_F32:
1138
+ {
1139
+ auto data_pts = cast_data<float>(dst);
1140
+ cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1141
+ break;
1142
+ }
1143
+ default:
1144
+ GGML_ABORT("GGML tensor type not supported!\n");
1145
+ break;
1146
+ }
1147
+ }
745
1148
 
746
- GGML_UNUSED(src1);
747
- GGML_UNUSED(dst);
748
- GGML_UNUSED(src1_dd);
749
- GGML_UNUSED(ctx);
1149
+ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1150
+ #if defined (GGML_SYCL_F16)
1151
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1152
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1153
+ #else
1154
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1155
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1156
+ #endif
1157
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1158
+ dpct::queue_ptr main_stream = ctx.stream();
1159
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1160
+ switch (dst->type) {
1161
+ #if defined (GGML_SYCL_F16)
1162
+ case GGML_TYPE_F16:
1163
+ {
1164
+ auto data_pts = cast_data<sycl::half>(dst);
1165
+ step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1166
+ break;
1167
+ }
1168
+ #endif
1169
+ case GGML_TYPE_F32:
1170
+ {
1171
+ auto data_pts = cast_data<float>(dst);
1172
+ step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1173
+ break;
1174
+ }
1175
+ default:
1176
+ GGML_ABORT("GGML tensor type not supported!\n");
1177
+ break;
1178
+ }
750
1179
  }
751
1180
 
752
- inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
753
- const ggml_tensor *src1, ggml_tensor *dst,
754
- const float *src0_dd, const float *src1_dd,
755
- float *dst_dd,
756
- const queue_ptr &main_stream) {
1181
+ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1182
+ #if defined (GGML_SYCL_F16)
1183
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1184
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1185
+ #else
1186
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1187
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1188
+ #endif
1189
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1190
+ dpct::queue_ptr main_stream = ctx.stream();
1191
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1192
+ switch (dst->type) {
1193
+ #if defined (GGML_SYCL_F16)
1194
+ case GGML_TYPE_F16:
1195
+ {
1196
+ auto data_pts = cast_data<sycl::half>(dst);
1197
+ neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1198
+ break;
1199
+ }
1200
+ #endif
1201
+ case GGML_TYPE_F32:
1202
+ {
1203
+ auto data_pts = cast_data<float>(dst);
1204
+ neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1205
+ break;
1206
+ }
1207
+ default:
1208
+ GGML_ABORT("GGML tensor type not supported!\n");
1209
+ break;
1210
+ }
1211
+ }
757
1212
 
758
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
759
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
1213
+ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1214
+ #if defined (GGML_SYCL_F16)
1215
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1216
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1217
+ #else
1218
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1219
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1220
+ #endif
760
1221
 
1222
+ GGML_ASSERT(dst->src[0]->type == dst->type);
761
1223
  float negative_slope;
762
1224
  memcpy(&negative_slope, dst->op_params, sizeof(float));
763
-
764
- leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
765
-
766
- GGML_UNUSED(src1);
767
- GGML_UNUSED(dst);
768
- GGML_UNUSED(src1_dd);
769
- GGML_UNUSED(ctx);
1225
+ dpct::queue_ptr main_stream = ctx.stream();
1226
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1227
+ switch (dst->type) {
1228
+ #if defined (GGML_SYCL_F16)
1229
+ case GGML_TYPE_F16:
1230
+ {
1231
+ auto data_pts = cast_data<sycl::half>(dst);
1232
+ leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
1233
+ break;
1234
+ }
1235
+ #endif
1236
+ case GGML_TYPE_F32:
1237
+ {
1238
+ auto data_pts = cast_data<float>(dst);
1239
+ leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
1240
+ break;
1241
+ }
1242
+ default:
1243
+ GGML_ABORT("GGML tensor type not supported!\n");
1244
+ break;
1245
+ }
770
1246
  }
771
1247
 
772
- inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
773
- ggml_tensor *dst, const float *src0_dd,
774
- const float *src1_dd, float *dst_dd,
775
- const queue_ptr &main_stream) {
776
-
777
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
778
- GGML_ASSERT( dst->type == GGML_TYPE_F32);
779
-
780
- sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
781
-
782
- GGML_UNUSED(src1);
783
- GGML_UNUSED(dst);
784
- GGML_UNUSED(src1_dd);
785
- GGML_UNUSED(ctx);
1248
+ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1249
+ #if defined (GGML_SYCL_F16)
1250
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1251
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1252
+ #else
1253
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1254
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1255
+ #endif
1256
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1257
+ dpct::queue_ptr main_stream = ctx.stream();
1258
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1259
+ switch (dst->type) {
1260
+ #if defined (GGML_SYCL_F16)
1261
+ case GGML_TYPE_F16:
1262
+ {
1263
+ auto data_pts = cast_data<sycl::half>(dst);
1264
+ sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1265
+ break;
1266
+ }
1267
+ #endif
1268
+ case GGML_TYPE_F32:
1269
+ {
1270
+ auto data_pts = cast_data<float>(dst);
1271
+ sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
1272
+ break;
1273
+ }
1274
+ default:
1275
+ GGML_ABORT("GGML tensor type not supported!\n");
1276
+ break;
1277
+ }
786
1278
  }
787
1279
 
788
- inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
789
- const ggml_tensor *src1, ggml_tensor *dst,
790
- const float *src0_dd, const float *src1_dd,
791
- float *dst_dd,
792
- const queue_ptr &main_stream) {
793
-
794
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
1280
+ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1281
+ #if defined (GGML_SYCL_F16)
1282
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1283
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1284
+ #else
1285
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
795
1286
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
796
-
797
- const float sf0 = (float)dst->ne[0]/src0->ne[0];
798
- const float sf1 = (float)dst->ne[1]/src0->ne[1];
799
- const float sf2 = (float)dst->ne[2]/src0->ne[2];
800
- const float sf3 = (float)dst->ne[3]/src0->ne[3];
801
-
802
- upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
803
- dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
804
- main_stream);
805
-
806
- GGML_UNUSED(src1);
807
- GGML_UNUSED(dst);
808
- GGML_UNUSED(src1_dd);
809
- GGML_UNUSED(ctx);
1287
+ #endif
1288
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1289
+
1290
+ dpct::queue_ptr main_stream = ctx.stream();
1291
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1292
+
1293
+ const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
1294
+ const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
1295
+ const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
1296
+ const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
1297
+ switch (dst->type) {
1298
+ #if defined (GGML_SYCL_F16)
1299
+ case GGML_TYPE_F16:
1300
+ {
1301
+ auto data_pts = cast_data<sycl::half>(dst);
1302
+ upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
1303
+ dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
1304
+ main_stream);
1305
+ break;
1306
+ }
1307
+ #endif
1308
+ case GGML_TYPE_F32:
1309
+ {
1310
+ auto data_pts = cast_data<float>(dst);
1311
+ upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
1312
+ dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
1313
+ main_stream);
1314
+ break;
1315
+ }
1316
+ default:
1317
+ GGML_ABORT("GGML tensor type not supported!\n");
1318
+ break;
1319
+ }
810
1320
  }
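+ // Worked example with hypothetical shapes (not from this diff): the sf0..sf3
+ // factors computed above are simply dst extent divided by src extent per
+ // dimension, e.g. doubling the two spatial axes of a 64x64x3x1 tensor.
+ // static void upscale_factor_example() {
+ //     const float src_ne[4] = {64.f, 64.f, 3.f, 1.f};
+ //     const float dst_ne[4] = {128.f, 128.f, 3.f, 1.f};
+ //     float sf[4];
+ //     for (int i = 0; i < 4; ++i) {
+ //         sf[i] = dst_ne[i] / src_ne[i]; // {2, 2, 1, 1}
+ //     }
+ //     (void) sf;
+ // }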
811
1321
 
812
- inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
813
- ggml_tensor *dst, const float *src0_dd,
814
- const float *src1_dd, float *dst_dd,
815
- const queue_ptr &main_stream) {
816
-
817
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
1322
+ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1323
+ #if defined (GGML_SYCL_F16)
1324
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1325
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1326
+ #else
1327
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
818
1328
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
819
- GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
1329
+ #endif
1330
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1331
+ GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
1332
+ dpct::queue_ptr main_stream = ctx.stream();
1333
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1334
+ switch (dst->type) {
1335
+ #if defined (GGML_SYCL_F16)
1336
+ case GGML_TYPE_F16:
1337
+ {
1338
+ auto data_pts = cast_data<sycl::half>(dst);
1339
+ pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
1340
+ dst->ne[1], dst->ne[2], main_stream);
1341
+ break;
1342
+ }
1343
+ #endif
1344
+ case GGML_TYPE_F32:
1345
+ {
1346
+ auto data_pts = cast_data<float>(dst);
1347
+ pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
1348
+ dst->ne[1], dst->ne[2], main_stream);
1349
+ break;
1350
+ }
1351
+ default:
1352
+ GGML_ABORT("GGML tensor type not supported!\n");
1353
+ break;
1354
+ }
1355
+ }
820
1356
 
821
- pad_f32_sycl(src0_dd, dst_dd,
822
- src0->ne[0], src0->ne[1], src0->ne[2],
823
- dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
1357
+ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1358
+ #if defined(GGML_SYCL_F16)
1359
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
1360
+ GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
1361
+ #else
824
1362
 
825
- GGML_UNUSED(src1);
826
- GGML_UNUSED(dst);
827
- GGML_UNUSED(src1_dd);
828
- GGML_UNUSED(ctx);
1363
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1364
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
1365
+ #endif
1366
+ GGML_ASSERT(dst->src[0]->type == dst->type);
1367
+ dpct::queue_ptr main_stream = ctx.stream();
1368
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1369
+ float min;
1370
+ float max;
1371
+ memcpy(&min, dst->op_params, sizeof(float));
1372
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
1373
+
1374
+ switch (dst->type) {
1375
+ #if defined(GGML_SYCL_F16)
1376
+ case GGML_TYPE_F16:
1377
+ {
1378
+ auto data_pts = cast_data<sycl::half>(dst);
1379
+ clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
1380
+ break;
1381
+ }
1382
+ #endif
1383
+ case GGML_TYPE_F32:
1384
+ {
1385
+ auto data_pts = cast_data<float>(dst);
1386
+ clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
1387
+ break;
1388
+ }
1389
+ default:
1390
+ GGML_ABORT("GGML tensor type not supported!\n");
1391
+ break;
1392
+ }
829
1393
  }
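+ // Sketch of the op_params convention the clamp and leaky_relu helpers rely
+ // on: ggml packs small per-op scalars into the tensor's int32 op_params
+ // array, and the float values are recovered with memcpy (from <cstring>,
+ // already used above) rather than a pointer dereference. The helper name
+ // below is illustrative, not an upstream API.
+ // static void read_clamp_params_sketch(const ggml_tensor * t, float & min_v, float & max_v) {
+ //     memcpy(&min_v, t->op_params, sizeof(float));
+ //     memcpy(&max_v, (const float *) t->op_params + 1, sizeof(float));
+ // }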
830
1394
 
831
- inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
832
- ggml_tensor *dst, const float *src0_dd,
833
- const float *src1_dd, float *dst_dd,
834
- const queue_ptr &main_stream) {
1395
+ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
835
1396
 
836
- GGML_ASSERT(src0->type == GGML_TYPE_F32);
837
- GGML_ASSERT(src1->type == GGML_TYPE_F32);
1397
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
1398
+ GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
838
1399
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
839
1400
  GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
1401
+ dpct::queue_ptr main_stream = ctx.stream();
1402
+ SYCL_CHECK(ggml_sycl_set_device(ctx.device));
1403
+ const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
1404
+ const float * src1_dd = static_cast<const float*>(dst->src[1]->data);
1405
+ float * dst_dd = static_cast<float *>(dst->data);
840
1406
 
841
1407
  int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
842
1408
  int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
843
1409
  // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
844
1410
  int offset = dst->op_params[3] / 4; // offset in bytes
845
1411
 
846
- acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
847
-
848
- GGML_UNUSED(dst);
849
- GGML_UNUSED(ctx);
850
- }
851
-
852
- inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
853
- ggml_tensor *dst, const float *src0_dd,
854
- const float *src1_dd, float *dst_dd,
855
- const queue_ptr &main_stream) {
856
-
857
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
858
- }
859
-
860
- inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
861
- ggml_tensor *dst, const float *src0_dd,
862
- const float *src1_dd, float *dst_dd,
863
- const queue_ptr &main_stream) {
864
-
865
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
866
- }
867
-
868
- inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
869
- ggml_tensor *dst, const float *src0_dd,
870
- const float *src1_dd, float *dst_dd,
871
- const queue_ptr &main_stream) {
872
-
873
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
874
- }
875
-
876
- inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
877
- ggml_tensor *dst, const float *src0_dd,
878
- const float *src1_dd, float *dst_dd,
879
- const queue_ptr &main_stream) {
880
-
881
- ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
1412
+ acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
882
1413
  }
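+ // Worked example of the byte-to-element conversion used by the acc helper
+ // above (F32 only, hypothetical value): op_params carries row/plane strides
+ // and the write offset in bytes, while acc_f32_sycl wants them in float
+ // elements, hence the divide by 4 (sizeof(float)).
+ // static void acc_stride_example() {
+ //     const int nb1_bytes = 256;                             // e.g. dst->op_params[0]
+ //     const int nb1_elems = nb1_bytes / (int) sizeof(float); // = 64 float elements
+ //     (void) nb1_elems;
+ // }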
883
1414
 
884
1415
 
885
1416
  void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
886
- GGML_SYCL_DEBUG("call %s\n", __func__);
887
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt);
1417
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1418
+ ggml_sycl_op_sqrt(ctx, dst);
888
1419
  GGML_SYCL_DEBUG("call %s done\n", __func__);
889
1420
  }
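+ // The entry points below now take only (ctx, dst) because a ggml op node
+ // already carries its operands: dst->src[0] is the unary input and, where
+ // needed (e.g. acc), dst->src[1] is the second operand. Illustrative
+ // accessor, not an upstream helper:
+ // static const ggml_tensor * unary_input_sketch(const ggml_tensor * dst) {
+ //     return dst->src[0];
+ // }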
890
1421
 
891
1422
  void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
892
- GGML_SYCL_DEBUG("call %s\n", __func__);
893
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin);
1423
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1424
+ ggml_sycl_op_sin(ctx, dst);
894
1425
  GGML_SYCL_DEBUG("call %s done\n", __func__);
895
1426
  }
896
1427
 
897
1428
  void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
898
- GGML_SYCL_DEBUG("call %s\n", __func__);
899
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos);
1429
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1430
+ ggml_sycl_op_cos(ctx, dst);
900
1431
  GGML_SYCL_DEBUG("call %s done\n", __func__);
901
1432
  }
902
1433
 
903
1434
  void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
904
- GGML_SYCL_DEBUG("call %s\n", __func__);
905
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc);
1435
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1436
+ ggml_sycl_op_acc(ctx, dst);
906
1437
  GGML_SYCL_DEBUG("call %s done\n", __func__);
907
1438
  }
908
1439
 
909
1440
  void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
910
- GGML_SYCL_DEBUG("call %s\n", __func__);
911
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu);
1441
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1442
+ ggml_sycl_op_gelu(ctx, dst);
912
1443
  GGML_SYCL_DEBUG("call %s done\n", __func__);
913
1444
  }
914
1445
 
915
1446
  void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
916
- GGML_SYCL_DEBUG("call %s\n", __func__);
917
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu);
1447
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1448
+ ggml_sycl_op_silu(ctx, dst);
918
1449
  GGML_SYCL_DEBUG("call %s done\n", __func__);
919
1450
  }
920
1451
 
921
1452
  void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
922
- GGML_SYCL_DEBUG("call %s\n", __func__);
923
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick);
1453
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1454
+ ggml_sycl_op_gelu_quick(ctx, dst);
924
1455
  GGML_SYCL_DEBUG("call %s done\n", __func__);
925
1456
  }
926
1457
 
927
1458
  void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
928
- GGML_SYCL_DEBUG("call %s\n", __func__);
929
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh);
1459
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1460
+ ggml_sycl_op_tanh(ctx, dst);
930
1461
  GGML_SYCL_DEBUG("call %s done\n", __func__);
931
1462
  }
932
1463
 
933
1464
  void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
934
- GGML_SYCL_DEBUG("call %s\n", __func__);
935
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu);
1465
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1466
+ ggml_sycl_op_relu(ctx, dst);
936
1467
  GGML_SYCL_DEBUG("call %s done\n", __func__);
937
1468
  }
938
1469
 
939
1470
  void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
940
- GGML_SYCL_DEBUG("call %s\n", __func__);
941
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid);
1471
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1472
+ ggml_sycl_op_sigmoid(ctx, dst);
942
1473
  GGML_SYCL_DEBUG("call %s done\n", __func__);
943
1474
  }
944
1475
 
945
1476
  void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
946
- GGML_SYCL_DEBUG("call %s\n", __func__);
947
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid);
1477
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1478
+ ggml_sycl_op_hardsigmoid(ctx, dst);
948
1479
  GGML_SYCL_DEBUG("call %s done\n", __func__);
949
1480
  }
950
1481
 
951
1482
  void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
952
- GGML_SYCL_DEBUG("call %s\n", __func__);
953
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish);
1483
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1484
+ ggml_sycl_op_hardswish(ctx, dst);
954
1485
  GGML_SYCL_DEBUG("call %s done\n", __func__);
955
1486
  }
956
1487
 
957
1488
 
958
1489
  void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
959
- GGML_SYCL_DEBUG("call %s\n", __func__);
960
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp);
1490
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1491
+ ggml_sycl_op_exp(ctx, dst);
961
1492
  GGML_SYCL_DEBUG("call %s done\n", __func__);
962
1493
  }
963
1494
 
964
1495
  void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
965
- GGML_SYCL_DEBUG("call %s\n", __func__);
966
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log);
1496
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1497
+ ggml_sycl_op_log(ctx, dst);
967
1498
  GGML_SYCL_DEBUG("call %s done\n", __func__);
968
1499
  }
969
1500
 
970
1501
  void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
971
- GGML_SYCL_DEBUG("call %s\n", __func__);
972
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg);
1502
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1503
+ ggml_sycl_op_neg(ctx, dst);
973
1504
  GGML_SYCL_DEBUG("call %s done\n", __func__);
974
1505
  }
975
1506
 
976
1507
  void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
977
- GGML_SYCL_DEBUG("call %s\n", __func__);
978
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step);
1508
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1509
+ ggml_sycl_op_step(ctx, dst);
979
1510
  GGML_SYCL_DEBUG("call %s done\n", __func__);
980
1511
  }
981
1512
 
982
1513
  void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
983
- GGML_SYCL_DEBUG("call %s\n", __func__);
984
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu);
1514
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1515
+ ggml_sycl_op_leaky_relu(ctx, dst);
985
1516
  GGML_SYCL_DEBUG("call %s done\n", __func__);
986
1517
  }
987
1518
 
988
1519
  void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
989
- GGML_SYCL_DEBUG("call %s\n", __func__);
990
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr);
1520
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1521
+ ggml_sycl_op_sqr(ctx, dst);
991
1522
  GGML_SYCL_DEBUG("call %s done\n", __func__);
992
1523
  }
993
1524
 
994
1525
  void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
995
- GGML_SYCL_DEBUG("call %s\n", __func__);
996
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale);
1526
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1527
+ ggml_sycl_op_upscale(ctx, dst);
997
1528
  GGML_SYCL_DEBUG("call %s done\n", __func__);
998
1529
  }
999
1530
 
1000
1531
  void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1001
- GGML_SYCL_DEBUG("call %s\n", __func__);
1002
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad);
1532
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1533
+ ggml_sycl_op_pad(ctx, dst);
1003
1534
  GGML_SYCL_DEBUG("call %s done\n", __func__);
1004
1535
  }
1005
1536
 
1006
-
1007
-
1008
- void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1009
- GGML_SYCL_DEBUG("call %s\n", __func__);
1010
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
1537
+ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1538
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1539
+ ggml_sycl_op_clamp(ctx, dst);
1011
1540
  GGML_SYCL_DEBUG("call %s done\n", __func__);
1012
1541
  }
1013
1542
 
1014
- void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1015
- GGML_SYCL_DEBUG("call %s\n", __func__);
1016
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub);
1543
+ void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1544
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1545
+ ggml_sycl_op_sgn(ctx, dst);
1017
1546
  GGML_SYCL_DEBUG("call %s done\n", __func__);
1018
1547
  }
1019
1548
 
1020
- void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1021
- GGML_SYCL_DEBUG("call %s\n", __func__);
1022
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul);
1549
+ void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1550
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1551
+ ggml_sycl_op_abs(ctx, dst);
1023
1552
  GGML_SYCL_DEBUG("call %s done\n", __func__);
1024
1553
  }
1025
1554
 
1026
- void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1027
- GGML_SYCL_DEBUG("call %s\n", __func__);
1028
- ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div);
1555
+ void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
1556
+ GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
1557
+ ggml_sycl_op_elu(ctx, dst);
1029
1558
  GGML_SYCL_DEBUG("call %s done\n", __func__);
1030
1559
  }