whispercpp 1.3.0 → 1.3.1

Files changed (132)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +60 -11
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -16
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/{ggml.h → ggml/include/ggml.h} +479 -596
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/{whisper.h → include/whisper.h} +23 -22
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1492 -9
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/{whisper.cpp → src/whisper.cpp} +661 -492
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -21755
data/ext/ggml/src/ggml-sycl/element_wise.cpp
@@ -0,0 +1,1030 @@
+ #include "common.hpp"
+ #include "element_wise.hpp"
+
+ void acc_f32(const float * x, const float * y, float * dst, const int ne,
+              const int ne10, const int ne11, const int ne12,
+              const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+     if (i >= ne) {
+         return;
+     }
+     int src1_idx = i - offset;
+     int oz = src1_idx / nb2;
+     int oy = (src1_idx - (oz * nb2)) / nb1;
+     int ox = src1_idx % nb1;
+     if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+         dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+     } else {
+         dst[i] = x[i];
+     }
+ }
+
+ void gelu_f32(const float * x, float * dst, const int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const float GELU_COEF_A = 0.044715f;
+     const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+
+     float xi = x[i];
+     dst[i] = 0.5f * xi *
+              (1.0f +
+               sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+ }
+
+ void silu_f32(const float * x, float * dst, const int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+ }
+
+ void gelu_quick_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+     const float GELU_QUICK_COEF = -1.702f;
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+ }
+
+ void tanh_f32(const float *x, float *dst, int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::tanh((float)(x[i]));
+ }
+
+ void relu_f32(const float * x, float * dst, const int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::fmax((float)(x[i]), (float)0);
+ }
+
+ void sigmoid_f32(const float * x, float * dst, const int k,
+                  const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
+ }
+
+ void sqrt_f32(const float * x, float * dst, const int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::sqrt(x[i]);
+ }
+
+ void sin_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::sin(x[i]);
+ }
+
+ void cos_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::cos(x[i]);
+ }
+
+ void hardsigmoid_f32(const float * x, float * dst, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+ }
+
+ void hardswish_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
+ }
+
+ void exp_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::exp(x[i]);
+ }
+
+ void log_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     float xi = x[i];
+     if (xi <= 0) {
+         dst[i] = -INFINITY;
+     } else {
+         dst[i] = sycl::log(xi);
+     }
+ }
+
+ void neg_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = -x[i];
+ }
+
+ void step_f32(const float * x, float * dst, const int k,
+               const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] > 0.0f;
+ }
+
+ void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                     const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+     if (i >= k) {
+         return;
+     }
+     dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+              sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+ }
+
+ void sqr_f32(const float * x, float * dst, const int k,
+              const sycl::nd_item<3> &item_ct1) {
+     const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+     if (i >= k) {
+         return;
+     }
+     dst[i] = x[i] * x[i];
+ }
+
+ void upscale_f32(const float *x, float *dst, const int nb00, const int nb01,
+                  const int nb02, const int nb03, const int ne10, const int ne11,
+                  const int ne12, const int ne13, const float sf0, const float sf1,
+                  const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
+     int index = item_ct1.get_local_id(0) +
+                 item_ct1.get_group(0) * item_ct1.get_local_range(0);
+     if (index >= ne10 * ne11 * ne12 * ne13) {
+         return;
+     }
+     // operation
+     int i10 = index % ne10;
+     int i11 = (index / ne10) % ne11;
+     int i12 = (index / (ne10 * ne11)) % ne12;
+     int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
+
+     int i00 = i10 / sf0;
+     int i01 = i11 / sf1;
+     int i02 = i12 / sf2;
+     int i03 = i13 / sf3;
+
+     dst[index] = *(const float *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
+ }
+
+ void pad_f32(const float *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+              const sycl::nd_item<3> &item_ct1) {
+     int nidx = item_ct1.get_local_id(2) +
+                item_ct1.get_group(2) * item_ct1.get_local_range(2);
+     if (nidx >= ne0) {
+         return;
+     }
+
+     // operation
+     int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                      item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+     if (nidx < ne00 && item_ct1.get_group(1) < (size_t) ne01 && item_ct1.get_group(0) < (size_t) ne02) {
+         int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                          item_ct1.get_group(0) * ne00 * ne01;
+         dst[offset_dst] = x[offset_src];
+     } else {
+         dst[offset_dst] = 0.0f;
+     }
+ }
+
+
+
+ void acc_f32_sycl(const float *x, const float *y, float *dst,
+                   const int n_elements, const int ne10, const int ne11,
+                   const int ne12, const int nb1, const int nb2,
+                   const int offset, queue_ptr stream) {
+     int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                     item_ct1);
+         });
+ }
+
+ void gelu_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             gelu_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void silu_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             silu_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
+                          queue_ptr stream) {
+     const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             gelu_quick_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void tanh_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             tanh_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void relu_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             relu_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
+                           queue_ptr stream) {
+     const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             hardsigmoid_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void hardswish_f32_sycl(const float *x, float *dst, const int k,
+                         queue_ptr stream) {
+     const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             hardswish_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void exp_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             exp_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void log_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             log_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void neg_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             neg_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void step_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             step_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void sigmoid_f32_sycl(const float *x, float *dst, const int k,
+                       queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             sigmoid_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void sqrt_f32_sycl(const float *x, float *dst, const int k,
+                    queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             sqrt_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void sin_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             sin_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void cos_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             cos_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
+                          const float negative_slope,
+                          queue_ptr stream) {
+     const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+         });
+ }
+
+ void sqr_f32_sycl(const float *x, float *dst, const int k,
+                   queue_ptr stream) {
+     const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
+     stream->parallel_for(
+         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                           sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             sqr_f32(x, dst, k, item_ct1);
+         });
+ }
+
+ void upscale_f32_sycl(const float *x, float *dst, const int nb00, const int nb01,
+                       const int nb02, const int nb03, const int ne10, const int ne11,
+                       const int ne12, const int ne13, const float sf0, const float sf1,
+                       const float sf2, const float sf3, queue_ptr stream) {
+     int dst_size = ne10 * ne11 * ne12 * ne13;
+     int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
+     sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
+     stream->parallel_for(
+         sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
+         [=](sycl::nd_item<1> item_ct1) {
+             upscale_f32(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
+         });
+ }
+
+ void pad_f32_sycl(const float *x, float *dst, const int ne00,
+                   const int ne01, const int ne02, const int ne0,
+                   const int ne1, const int ne2, queue_ptr stream) {
+     int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
+     sycl::range<3> gridDim(ne2, ne1, num_blocks);
+     stream->parallel_for(
+         sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+                           sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
+         [=](sycl::nd_item<3> item_ct1) {
+             pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+         });
+ }
+
+ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor *dst,
+                                     const float *src0_dd, const float *src1_dd,
+                                     float *dst_dd,
+                                     const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+     tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                      const ggml_tensor *src1, ggml_tensor *dst,
+                                      const float *src0_dd, const float *src1_dd,
+                                      float *dst_dd,
+                                      const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                              const ggml_tensor *src1, ggml_tensor *dst,
+                              const float *src0_dd, const float *src1_dd,
+                              float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                              const ggml_tensor *src1, ggml_tensor *dst,
+                              const float *src0_dd, const float *src1_dd,
+                              float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                               const ggml_tensor *src1, ggml_tensor *dst,
+                               const float *src0_dd, const float *src1_dd,
+                               float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                              const ggml_tensor *src1, ggml_tensor *dst,
+                              const float *src0_dd, const float *src1_dd,
+                              float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                              const ggml_tensor *src1, ggml_tensor *dst,
+                              const float *src0_dd, const float *src1_dd,
+                              float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                               const ggml_tensor *src1, ggml_tensor *dst,
+                               const float *src0_dd, const float *src1_dd,
+                               float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                              const ggml_tensor *src1, ggml_tensor *dst,
+                              const float *src0_dd, const float *src1_dd,
+                              float *dst_dd, const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                     const ggml_tensor *src1, ggml_tensor *dst,
+                                     const float *src0_dd, const float *src1_dd,
+                                     float *dst_dd,
+                                     const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     float negative_slope;
+     memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+     leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+     sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+     const float sf0 = (float)dst->ne[0]/src0->ne[0];
+     const float sf1 = (float)dst->ne[1]/src0->ne[1];
+     const float sf2 = (float)dst->ne[2]/src0->ne[2];
+     const float sf3 = (float)dst->ne[3]/src0->ne[3];
+
+     upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+                      dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
+                      main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F32);
+     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+     pad_f32_sycl(src0_dd, dst_dd,
+                  src0->ne[0], src0->ne[1], src0->ne[2],
+                  dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+     GGML_UNUSED(src1);
+     GGML_UNUSED(dst);
+     GGML_UNUSED(src1_dd);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F32);
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+     int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+     int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+     // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+     int offset = dst->op_params[3] / 4; // offset in bytes
+
+     acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+     GGML_UNUSED(dst);
+     GGML_UNUSED(ctx);
+ }
+
+ inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }
+
+ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }
+
+ inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }
+
+ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const queue_ptr &main_stream) {
+
+     ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+ }
+
+
+ void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqrt);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_sin(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sin);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_cos(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_cos);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_acc(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_acc);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_silu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_silu);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_gelu_quick);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_tanh);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_relu);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sigmoid);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardsigmoid);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_hardswish);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+
+ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_exp);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_log(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_log);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_neg(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_neg);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_step(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_step);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_leaky_relu);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sqr);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_upscale);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_pad);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+
+
+ void ggml_sycl_add(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_add);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_sub(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_sub);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_mul(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_mul);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
+
+ void ggml_sycl_div(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+     GGML_SYCL_DEBUG("call %s\n", __func__);
+     ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_div);
+     GGML_SYCL_DEBUG("call %s done\n", __func__);
+ }
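
Every *_f32_sycl launcher in the new file follows the same launch recipe: compute a ceil-division block count so the global range covers all k elements, launch an nd_range kernel, and bounds-check inside the kernel because the rounded-up global range can overshoot. The standalone sketch below reproduces that pattern using only standard SYCL 2020 APIs; the queue setup, the BLOCK_SIZE of 256 (standing in for the gem's SYCL_*_BLOCK_SIZE constants), and the element count are illustrative assumptions, not code from this diff.

// Minimal sketch of the ceil-div + guarded-index launch pattern used by
// relu_f32_sycl and the other launchers above (assumptions noted in the text).
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    constexpr int BLOCK_SIZE = 256;  // stand-in for SYCL_RELU_BLOCK_SIZE
    const int k = 1000;              // deliberately not a multiple of BLOCK_SIZE
    sycl::queue q;                   // default-selected device

    float *x   = sycl::malloc_shared<float>(k, q);
    float *dst = sycl::malloc_shared<float>(k, q);
    for (int i = 0; i < k; ++i) x[i] = i - 500.0f;

    // ceil-division, exactly as in the launchers: the last block is partially full
    const int num_blocks = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;

    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE),
                          sycl::range<1>(BLOCK_SIZE)),
        [=](sycl::nd_item<1> it) {
            const int i = it.get_global_id(0);
            if (i >= k) return;               // guard the padded tail
            dst[i] = sycl::fmax(x[i], 0.0f);  // ReLU, as in relu_f32
        }).wait();

    std::printf("dst[0]=%g dst[%d]=%g\n", dst[0], k - 1, dst[k - 1]);
    sycl::free(x, q);
    sycl::free(dst, q);
    return 0;
}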