@fugood/llama.node 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +243 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +14 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +161 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1544 -291
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include "common.hpp"
|
|
2
|
+
#include "ggml.h"
|
|
2
3
|
#include "element_wise.hpp"
|
|
3
4
|
|
|
4
5
|
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
@@ -20,10 +21,32 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
|
20
21
|
}
|
|
21
22
|
}
|
|
22
23
|
|
|
23
|
-
|
|
24
|
+
template<typename T>
|
|
25
|
+
static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
26
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
27
|
+
dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
template<typename T>
|
|
32
|
+
static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
33
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
34
|
+
dst[i] = sycl::fabs(x[i]);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
template<typename T>
|
|
39
|
+
static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
40
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
41
|
+
dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
template<typename T>
|
|
46
|
+
static void gelu(const T * x, T * dst, const int k,
|
|
24
47
|
const sycl::nd_item<3> &item_ct1) {
|
|
25
|
-
const
|
|
26
|
-
const
|
|
48
|
+
const T GELU_COEF_A = static_cast<T>(0.044715f);
|
|
49
|
+
const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
|
|
27
50
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
28
51
|
item_ct1.get_local_id(2);
|
|
29
52
|
|
|
@@ -32,12 +55,13 @@ static void gelu_f32(const float * x, float * dst, const int k,
|
|
|
32
55
|
}
|
|
33
56
|
|
|
34
57
|
float xi = x[i];
|
|
35
|
-
dst[i] = 0.5f * xi *
|
|
36
|
-
(1.0f +
|
|
37
|
-
sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
|
|
58
|
+
dst[i] = static_cast<T>(0.5f) * xi *
|
|
59
|
+
(static_cast<T>(1.0f) +
|
|
60
|
+
sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast<T>(1.0f) + GELU_COEF_A * xi * xi)));
|
|
38
61
|
}
|
|
39
62
|
|
|
40
|
-
|
|
63
|
+
template<typename T>
|
|
64
|
+
static void silu(const T * x, T * dst, const int k,
|
|
41
65
|
const sycl::nd_item<3> &item_ct1) {
|
|
42
66
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
43
67
|
item_ct1.get_local_id(2);
|
|
@@ -45,10 +69,11 @@ static void silu_f32(const float * x, float * dst, const int k,
|
|
|
45
69
|
if (i >= k) {
|
|
46
70
|
return;
|
|
47
71
|
}
|
|
48
|
-
dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
|
|
72
|
+
dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
|
|
49
73
|
}
|
|
50
74
|
|
|
51
|
-
|
|
75
|
+
template<typename T>
|
|
76
|
+
static void gelu_quick(const T *x, T *dst, int k,
|
|
52
77
|
const sycl::nd_item<3> &item_ct1) {
|
|
53
78
|
const float GELU_QUICK_COEF = -1.702f;
|
|
54
79
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
@@ -56,20 +81,22 @@ static void gelu_quick_f32(const float *x, float *dst, int k,
|
|
|
56
81
|
if (i >= k) {
|
|
57
82
|
return;
|
|
58
83
|
}
|
|
59
|
-
dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
|
84
|
+
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
|
60
85
|
}
|
|
61
86
|
|
|
62
|
-
|
|
87
|
+
template<typename T>
|
|
88
|
+
static void tanh(const T *x, T *dst, int k,
|
|
63
89
|
const sycl::nd_item<3> &item_ct1) {
|
|
64
90
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
65
91
|
item_ct1.get_local_id(2);
|
|
66
92
|
if (i >= k) {
|
|
67
93
|
return;
|
|
68
94
|
}
|
|
69
|
-
dst[i] = sycl::tanh((
|
|
95
|
+
dst[i] = sycl::tanh((x[i]));
|
|
70
96
|
}
|
|
71
97
|
|
|
72
|
-
|
|
98
|
+
template<typename T>
|
|
99
|
+
static void relu(const T * x, T * dst, const int k,
|
|
73
100
|
const sycl::nd_item<3> &item_ct1) {
|
|
74
101
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
75
102
|
item_ct1.get_local_id(2);
|
|
@@ -77,10 +104,11 @@ static void relu_f32(const float * x, float * dst, const int k,
|
|
|
77
104
|
if (i >= k) {
|
|
78
105
|
return;
|
|
79
106
|
}
|
|
80
|
-
dst[i] = sycl::fmax((
|
|
107
|
+
dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
|
|
81
108
|
}
|
|
82
109
|
|
|
83
|
-
|
|
110
|
+
template<typename T>
|
|
111
|
+
static void sigmoid(const T * x, T * dst, const int k,
|
|
84
112
|
const sycl::nd_item<3> &item_ct1) {
|
|
85
113
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
86
114
|
item_ct1.get_local_id(2);
|
|
@@ -88,10 +116,11 @@ static void sigmoid_f32(const float * x, float * dst, const int k,
|
|
|
88
116
|
if (i >= k) {
|
|
89
117
|
return;
|
|
90
118
|
}
|
|
91
|
-
dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
|
|
119
|
+
dst[i] = 1.0f / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
|
|
92
120
|
}
|
|
93
121
|
|
|
94
|
-
|
|
122
|
+
template<typename T>
|
|
123
|
+
static void sqrt(const T * x, T * dst, const int k,
|
|
95
124
|
const sycl::nd_item<3> &item_ct1) {
|
|
96
125
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
97
126
|
item_ct1.get_local_id(2);
|
|
@@ -102,7 +131,8 @@ static void sqrt_f32(const float * x, float * dst, const int k,
|
|
|
102
131
|
dst[i] = sycl::sqrt(x[i]);
|
|
103
132
|
}
|
|
104
133
|
|
|
105
|
-
|
|
134
|
+
template<typename T>
|
|
135
|
+
static void sin(const T * x, T * dst, const int k,
|
|
106
136
|
const sycl::nd_item<3> &item_ct1) {
|
|
107
137
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
108
138
|
item_ct1.get_local_id(2);
|
|
@@ -113,7 +143,8 @@ static void sin_f32(const float * x, float * dst, const int k,
|
|
|
113
143
|
dst[i] = sycl::sin(x[i]);
|
|
114
144
|
}
|
|
115
145
|
|
|
116
|
-
|
|
146
|
+
template<typename T>
|
|
147
|
+
static void cos(const T * x, T * dst, const int k,
|
|
117
148
|
const sycl::nd_item<3> &item_ct1) {
|
|
118
149
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
119
150
|
item_ct1.get_local_id(2);
|
|
@@ -124,7 +155,8 @@ static void cos_f32(const float * x, float * dst, const int k,
|
|
|
124
155
|
dst[i] = sycl::cos(x[i]);
|
|
125
156
|
}
|
|
126
157
|
|
|
127
|
-
|
|
158
|
+
template<typename T>
|
|
159
|
+
static void hardsigmoid(const T * x, T * dst, const int k,
|
|
128
160
|
const sycl::nd_item<3> &item_ct1) {
|
|
129
161
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
130
162
|
item_ct1.get_local_id(2);
|
|
@@ -132,10 +164,11 @@ static void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
|
132
164
|
if (i >= k) {
|
|
133
165
|
return;
|
|
134
166
|
}
|
|
135
|
-
dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
167
|
+
dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
|
|
136
168
|
}
|
|
137
169
|
|
|
138
|
-
|
|
170
|
+
template<typename T>
|
|
171
|
+
static void hardswish(const T * x, T * dst, const int k,
|
|
139
172
|
const sycl::nd_item<3> &item_ct1) {
|
|
140
173
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
141
174
|
item_ct1.get_local_id(2);
|
|
@@ -143,10 +176,11 @@ static void hardswish_f32(const float * x, float * dst, const int k,
|
|
|
143
176
|
if (i >= k) {
|
|
144
177
|
return;
|
|
145
178
|
}
|
|
146
|
-
dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
179
|
+
dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
|
|
147
180
|
}
|
|
148
181
|
|
|
149
|
-
|
|
182
|
+
template<typename T>
|
|
183
|
+
static void exp(const T * x, T * dst, const int k,
|
|
150
184
|
const sycl::nd_item<3> &item_ct1) {
|
|
151
185
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
152
186
|
item_ct1.get_local_id(2);
|
|
@@ -157,7 +191,8 @@ static void exp_f32(const float * x, float * dst, const int k,
|
|
|
157
191
|
dst[i] = sycl::exp(x[i]);
|
|
158
192
|
}
|
|
159
193
|
|
|
160
|
-
|
|
194
|
+
template<typename T>
|
|
195
|
+
static void log(const T * x, T * dst, const int k,
|
|
161
196
|
const sycl::nd_item<3> &item_ct1) {
|
|
162
197
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
163
198
|
item_ct1.get_local_id(2);
|
|
@@ -165,15 +200,16 @@ static void log_f32(const float * x, float * dst, const int k,
|
|
|
165
200
|
if (i >= k) {
|
|
166
201
|
return;
|
|
167
202
|
}
|
|
168
|
-
|
|
203
|
+
T xi = x[i];
|
|
169
204
|
if (xi <= 0) {
|
|
170
|
-
dst[i] =
|
|
205
|
+
dst[i] = neg_infinity<T>();
|
|
171
206
|
} else {
|
|
172
207
|
dst[i] = sycl::log(xi);
|
|
173
208
|
}
|
|
174
209
|
}
|
|
175
210
|
|
|
176
|
-
|
|
211
|
+
template<typename T>
|
|
212
|
+
static void neg(const T * x, T * dst, const int k,
|
|
177
213
|
const sycl::nd_item<3> &item_ct1) {
|
|
178
214
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
179
215
|
item_ct1.get_local_id(2);
|
|
@@ -184,7 +220,8 @@ static void neg_f32(const float * x, float * dst, const int k,
|
|
|
184
220
|
dst[i] = -x[i];
|
|
185
221
|
}
|
|
186
222
|
|
|
187
|
-
|
|
223
|
+
template<typename T>
|
|
224
|
+
static void step(const T * x, T * dst, const int k,
|
|
188
225
|
const sycl::nd_item<3> &item_ct1) {
|
|
189
226
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
190
227
|
item_ct1.get_local_id(2);
|
|
@@ -192,21 +229,23 @@ static void step_f32(const float * x, float * dst, const int k,
|
|
|
192
229
|
if (i >= k) {
|
|
193
230
|
return;
|
|
194
231
|
}
|
|
195
|
-
dst[i] = x[i] > 0.0f;
|
|
232
|
+
dst[i] = x[i] > static_cast<T>(0.0f);
|
|
196
233
|
}
|
|
197
234
|
|
|
198
|
-
|
|
235
|
+
template<typename T>
|
|
236
|
+
static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
|
|
199
237
|
const sycl::nd_item<3> &item_ct1) {
|
|
200
238
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
201
239
|
item_ct1.get_local_id(2);
|
|
202
240
|
if (i >= k) {
|
|
203
241
|
return;
|
|
204
242
|
}
|
|
205
|
-
dst[i] = sycl::fmax((
|
|
206
|
-
sycl::fmin((
|
|
243
|
+
dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
|
|
244
|
+
sycl::fmin((x[i]), static_cast<T>(0.0f)) * negative_slope;
|
|
207
245
|
}
|
|
208
246
|
|
|
209
|
-
|
|
247
|
+
template<typename T>
|
|
248
|
+
static void sqr(const T * x, T * dst, const int k,
|
|
210
249
|
const sycl::nd_item<3> &item_ct1) {
|
|
211
250
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
212
251
|
item_ct1.get_local_id(2);
|
|
@@ -217,7 +256,8 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
|
|
217
256
|
dst[i] = x[i] * x[i];
|
|
218
257
|
}
|
|
219
258
|
|
|
220
|
-
|
|
259
|
+
template<typename T>
|
|
260
|
+
static void upscale(const T *x, T *dst, const int nb00, const int nb01,
|
|
221
261
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
222
262
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
223
263
|
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
@@ -237,10 +277,11 @@ static void upscale_f32(const float *x, float *dst, const int nb00, const int n
|
|
|
237
277
|
int i02 = i12 / sf2;
|
|
238
278
|
int i03 = i13 / sf3;
|
|
239
279
|
|
|
240
|
-
dst[index] = *(const
|
|
280
|
+
dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
|
241
281
|
}
|
|
242
282
|
|
|
243
|
-
|
|
283
|
+
template <typename T>
|
|
284
|
+
static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
244
285
|
const sycl::nd_item<3> &item_ct1) {
|
|
245
286
|
int nidx = item_ct1.get_local_id(2) +
|
|
246
287
|
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
|
@@ -256,11 +297,23 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
|
|
|
256
297
|
item_ct1.get_group(0) * ne00 * ne01;
|
|
257
298
|
dst[offset_dst] = x[offset_src];
|
|
258
299
|
} else {
|
|
259
|
-
dst[offset_dst] = 0.0f;
|
|
300
|
+
dst[offset_dst] = static_cast<T>(0.0f);
|
|
260
301
|
}
|
|
261
302
|
}
|
|
262
303
|
|
|
263
304
|
|
|
305
|
+
template<typename T>
|
|
306
|
+
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
|
|
307
|
+
const sycl::nd_item<3> &item_ct1) {
|
|
308
|
+
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
309
|
+
item_ct1.get_local_id(2);
|
|
310
|
+
|
|
311
|
+
if (i >= k) {
|
|
312
|
+
return;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
|
|
316
|
+
}
|
|
264
317
|
|
|
265
318
|
static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
266
319
|
const int n_elements, const int ne10, const int ne11,
|
|
@@ -277,7 +330,8 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
277
330
|
});
|
|
278
331
|
}
|
|
279
332
|
|
|
280
|
-
|
|
333
|
+
template<typename T>
|
|
334
|
+
static void gelu_sycl(const T *x, T *dst, const int k,
|
|
281
335
|
queue_ptr stream) {
|
|
282
336
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
283
337
|
stream->parallel_for(
|
|
@@ -285,11 +339,12 @@ static void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
285
339
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
|
286
340
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
|
287
341
|
[=](sycl::nd_item<3> item_ct1) {
|
|
288
|
-
|
|
342
|
+
gelu(x, dst, k, item_ct1);
|
|
289
343
|
});
|
|
290
344
|
}
|
|
291
345
|
|
|
292
|
-
|
|
346
|
+
template<typename T>
|
|
347
|
+
static void silu_sycl(const T *x, T *dst, const int k,
|
|
293
348
|
queue_ptr stream) {
|
|
294
349
|
const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
|
|
295
350
|
stream->parallel_for(
|
|
@@ -297,11 +352,43 @@ static void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
297
352
|
sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
|
|
298
353
|
sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
|
|
299
354
|
[=](sycl::nd_item<3> item_ct1) {
|
|
300
|
-
|
|
355
|
+
silu(x, dst, k, item_ct1);
|
|
301
356
|
});
|
|
302
357
|
}
|
|
303
358
|
|
|
304
|
-
|
|
359
|
+
template<typename T>
|
|
360
|
+
static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
361
|
+
// hard code for now
|
|
362
|
+
const int num_blocks = ceil_div(k, 256);
|
|
363
|
+
stream->parallel_for(
|
|
364
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
365
|
+
sgn(x, dst, k, item_ct1);
|
|
366
|
+
});
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
template<typename T>
|
|
370
|
+
static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
371
|
+
// hard code for now
|
|
372
|
+
const int num_blocks = ceil_div(k, 256);
|
|
373
|
+
stream->parallel_for(
|
|
374
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
375
|
+
abs_op(x, dst, k, item_ct1);
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
template<typename T>
|
|
381
|
+
static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
382
|
+
// hard code for now
|
|
383
|
+
const int num_blocks = ceil_div(k, 256);
|
|
384
|
+
stream->parallel_for(
|
|
385
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
386
|
+
elu_op(x, dst, k, item_ct1);
|
|
387
|
+
});
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
template<typename T>
|
|
391
|
+
static void gelu_quick_sycl(const T *x, T *dst, const int k,
|
|
305
392
|
queue_ptr stream) {
|
|
306
393
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
307
394
|
stream->parallel_for(
|
|
@@ -309,11 +396,12 @@ static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
|
309
396
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
|
310
397
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
|
311
398
|
[=](sycl::nd_item<3> item_ct1) {
|
|
312
|
-
|
|
399
|
+
gelu_quick(x, dst, k, item_ct1);
|
|
313
400
|
});
|
|
314
401
|
}
|
|
315
402
|
|
|
316
|
-
|
|
403
|
+
template<typename T>
|
|
404
|
+
static void tanh_sycl(const T *x, T *dst, const int k,
|
|
317
405
|
queue_ptr stream) {
|
|
318
406
|
const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
|
|
319
407
|
stream->parallel_for(
|
|
@@ -321,11 +409,12 @@ static void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
|
321
409
|
sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
|
|
322
410
|
sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
|
|
323
411
|
[=](sycl::nd_item<3> item_ct1) {
|
|
324
|
-
|
|
412
|
+
tanh(x, dst, k, item_ct1);
|
|
325
413
|
});
|
|
326
414
|
}
|
|
327
415
|
|
|
328
|
-
|
|
416
|
+
template<typename T>
|
|
417
|
+
static void relu_sycl(const T *x, T *dst, const int k,
|
|
329
418
|
queue_ptr stream) {
|
|
330
419
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
331
420
|
stream->parallel_for(
|
|
@@ -333,11 +422,12 @@ static void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
333
422
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
|
|
334
423
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
|
|
335
424
|
[=](sycl::nd_item<3> item_ct1) {
|
|
336
|
-
|
|
425
|
+
relu(x, dst, k, item_ct1);
|
|
337
426
|
});
|
|
338
427
|
}
|
|
339
428
|
|
|
340
|
-
|
|
429
|
+
template<typename T>
|
|
430
|
+
static void hardsigmoid_sycl(const T *x, T *dst, const int k,
|
|
341
431
|
queue_ptr stream) {
|
|
342
432
|
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
|
|
343
433
|
stream->parallel_for(
|
|
@@ -345,11 +435,12 @@ static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
345
435
|
sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
|
|
346
436
|
sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
|
|
347
437
|
[=](sycl::nd_item<3> item_ct1) {
|
|
348
|
-
|
|
438
|
+
hardsigmoid(x, dst, k, item_ct1);
|
|
349
439
|
});
|
|
350
440
|
}
|
|
351
441
|
|
|
352
|
-
|
|
442
|
+
template<typename T>
|
|
443
|
+
static void hardswish_sycl(const T *x, T *dst, const int k,
|
|
353
444
|
queue_ptr stream) {
|
|
354
445
|
const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
|
|
355
446
|
stream->parallel_for(
|
|
@@ -357,11 +448,12 @@ static void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
|
357
448
|
sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
|
|
358
449
|
sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
|
|
359
450
|
[=](sycl::nd_item<3> item_ct1) {
|
|
360
|
-
|
|
451
|
+
hardswish(x, dst, k, item_ct1);
|
|
361
452
|
});
|
|
362
453
|
}
|
|
363
454
|
|
|
364
|
-
|
|
455
|
+
template<typename T>
|
|
456
|
+
static void exp_sycl(const T *x, T *dst, const int k,
|
|
365
457
|
queue_ptr stream) {
|
|
366
458
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
367
459
|
stream->parallel_for(
|
|
@@ -369,11 +461,12 @@ static void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
|
369
461
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
|
|
370
462
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
|
|
371
463
|
[=](sycl::nd_item<3> item_ct1) {
|
|
372
|
-
|
|
464
|
+
exp(x, dst, k, item_ct1);
|
|
373
465
|
});
|
|
374
466
|
}
|
|
375
467
|
|
|
376
|
-
|
|
468
|
+
template<typename T>
|
|
469
|
+
static void log_sycl(const T *x, T *dst, const int k,
|
|
377
470
|
queue_ptr stream) {
|
|
378
471
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
379
472
|
stream->parallel_for(
|
|
@@ -381,11 +474,12 @@ static void log_f32_sycl(const float *x, float *dst, const int k,
|
|
|
381
474
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
|
|
382
475
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
|
|
383
476
|
[=](sycl::nd_item<3> item_ct1) {
|
|
384
|
-
|
|
477
|
+
log(x, dst, k, item_ct1);
|
|
385
478
|
});
|
|
386
479
|
}
|
|
387
480
|
|
|
388
|
-
|
|
481
|
+
template<typename T>
|
|
482
|
+
static void neg_sycl(const T *x, T *dst, const int k,
|
|
389
483
|
queue_ptr stream) {
|
|
390
484
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
391
485
|
stream->parallel_for(
|
|
@@ -393,11 +487,12 @@ static void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
|
393
487
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
|
|
394
488
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
|
|
395
489
|
[=](sycl::nd_item<3> item_ct1) {
|
|
396
|
-
|
|
490
|
+
neg(x, dst, k, item_ct1);
|
|
397
491
|
});
|
|
398
492
|
}
|
|
399
493
|
|
|
400
|
-
|
|
494
|
+
template<typename T>
|
|
495
|
+
static void step_sycl(const T *x, T *dst, const int k,
|
|
401
496
|
queue_ptr stream) {
|
|
402
497
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
403
498
|
stream->parallel_for(
|
|
@@ -405,11 +500,12 @@ static void step_f32_sycl(const float *x, float *dst, const int k,
|
|
|
405
500
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
|
|
406
501
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
|
|
407
502
|
[=](sycl::nd_item<3> item_ct1) {
|
|
408
|
-
|
|
503
|
+
step(x, dst, k, item_ct1);
|
|
409
504
|
});
|
|
410
505
|
}
|
|
411
506
|
|
|
412
|
-
|
|
507
|
+
template<typename T>
|
|
508
|
+
static void sigmoid_sycl(const T *x, T *dst, const int k,
|
|
413
509
|
queue_ptr stream) {
|
|
414
510
|
const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
|
|
415
511
|
stream->parallel_for(
|
|
@@ -417,11 +513,12 @@ static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
417
513
|
sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
|
|
418
514
|
sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
|
|
419
515
|
[=](sycl::nd_item<3> item_ct1) {
|
|
420
|
-
|
|
516
|
+
sigmoid(x, dst, k, item_ct1);
|
|
421
517
|
});
|
|
422
518
|
}
|
|
423
519
|
|
|
424
|
-
|
|
520
|
+
template<typename T>
|
|
521
|
+
static void sqrt_sycl(const T *x, T *dst, const int k,
|
|
425
522
|
queue_ptr stream) {
|
|
426
523
|
const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
|
|
427
524
|
stream->parallel_for(
|
|
@@ -429,11 +526,12 @@ static void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
|
429
526
|
sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
|
|
430
527
|
sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
|
|
431
528
|
[=](sycl::nd_item<3> item_ct1) {
|
|
432
|
-
|
|
529
|
+
sqrt(x, dst, k, item_ct1);
|
|
433
530
|
});
|
|
434
531
|
}
|
|
435
532
|
|
|
436
|
-
|
|
533
|
+
template<typename T>
|
|
534
|
+
static void sin_sycl(const T *x, T *dst, const int k,
|
|
437
535
|
queue_ptr stream) {
|
|
438
536
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
439
537
|
stream->parallel_for(
|
|
@@ -441,11 +539,12 @@ static void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
|
441
539
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
|
|
442
540
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
|
|
443
541
|
[=](sycl::nd_item<3> item_ct1) {
|
|
444
|
-
|
|
542
|
+
sin(x, dst, k, item_ct1);
|
|
445
543
|
});
|
|
446
544
|
}
|
|
447
545
|
|
|
448
|
-
|
|
546
|
+
template<typename T>
|
|
547
|
+
static void cos_sycl(const T *x, T *dst, const int k,
|
|
449
548
|
queue_ptr stream) {
|
|
450
549
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
451
550
|
stream->parallel_for(
|
|
@@ -453,11 +552,12 @@ static void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
|
453
552
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
|
|
454
553
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
|
|
455
554
|
[=](sycl::nd_item<3> item_ct1) {
|
|
456
|
-
|
|
555
|
+
cos(x, dst, k, item_ct1);
|
|
457
556
|
});
|
|
458
557
|
}
|
|
459
558
|
|
|
460
|
-
|
|
559
|
+
template<typename T>
|
|
560
|
+
static void leaky_relu_sycl(const T *x, T *dst, const int k,
|
|
461
561
|
const float negative_slope,
|
|
462
562
|
queue_ptr stream) {
|
|
463
563
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
@@ -466,11 +566,12 @@ static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
466
566
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
|
|
467
567
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
|
|
468
568
|
[=](sycl::nd_item<3> item_ct1) {
|
|
469
|
-
|
|
569
|
+
leaky_relu(x, dst, k, negative_slope, item_ct1);
|
|
470
570
|
});
|
|
471
571
|
}
|
|
472
572
|
|
|
473
|
-
|
|
573
|
+
template<typename T>
|
|
574
|
+
static void sqr_sycl(const T *x, T *dst, const int k,
|
|
474
575
|
queue_ptr stream) {
|
|
475
576
|
const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
|
|
476
577
|
stream->parallel_for(
|
|
@@ -478,11 +579,12 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
|
478
579
|
sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
|
|
479
580
|
sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
|
|
480
581
|
[=](sycl::nd_item<3> item_ct1) {
|
|
481
|
-
|
|
582
|
+
sqr(x, dst, k, item_ct1);
|
|
482
583
|
});
|
|
483
584
|
}
|
|
484
585
|
|
|
485
|
-
|
|
586
|
+
template<typename T>
|
|
587
|
+
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
|
|
486
588
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
487
589
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
488
590
|
const float sf2, const float sf3, queue_ptr stream) {
|
|
@@ -492,11 +594,12 @@ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const i
|
|
|
492
594
|
stream->parallel_for(
|
|
493
595
|
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
|
494
596
|
[=](sycl::nd_item<1> item_ct1) {
|
|
495
|
-
|
|
597
|
+
upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
|
496
598
|
});
|
|
497
599
|
}
|
|
498
600
|
|
|
499
|
-
|
|
601
|
+
template<typename T>
|
|
602
|
+
static void pad_sycl(const T *x, T *dst, const int ne00,
|
|
500
603
|
const int ne01, const int ne02, const int ne0,
|
|
501
604
|
const int ne1, const int ne2, queue_ptr stream) {
|
|
502
605
|
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|
|
@@ -505,526 +608,952 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
|
|
505
608
|
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
|
|
506
609
|
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
|
|
507
610
|
[=](sycl::nd_item<3> item_ct1) {
|
|
508
|
-
|
|
611
|
+
pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
|
|
509
612
|
});
|
|
510
613
|
}
|
|
511
614
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
GGML_UNUSED(src1_dd);
|
|
525
|
-
GGML_UNUSED(ctx);
|
|
615
|
+
template<typename T>
|
|
616
|
+
static void clamp_sycl(const T *x, T *dst, const float min,
|
|
617
|
+
const float max, const int k,
|
|
618
|
+
queue_ptr stream) {
|
|
619
|
+
const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
|
|
620
|
+
stream->parallel_for(
|
|
621
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
|
622
|
+
sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
|
|
623
|
+
sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
|
|
624
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
625
|
+
clamp(x, dst, min, max, k, item_ct1);
|
|
626
|
+
});
|
|
526
627
|
}
|
|
527
628
|
|
|
528
|
-
inline void
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
534
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
629
|
+
inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
630
|
+
#if defined (GGML_SYCL_F16)
|
|
631
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
632
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
535
633
|
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
634
|
+
#else
|
|
635
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
636
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
637
|
+
#endif
|
|
638
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
639
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
640
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
641
|
+
switch (dst->type) {
|
|
642
|
+
#if defined (GGML_SYCL_F16)
|
|
643
|
+
case GGML_TYPE_F16:
|
|
644
|
+
{
|
|
645
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
646
|
+
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
647
|
+
break;
|
|
648
|
+
}
|
|
649
|
+
#endif
|
|
650
|
+
case GGML_TYPE_F32:
|
|
651
|
+
{
|
|
652
|
+
auto data_pts = cast_data<float>(dst);
|
|
653
|
+
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
654
|
+
break;
|
|
655
|
+
}
|
|
656
|
+
default:
|
|
657
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
658
|
+
break;
|
|
659
|
+
}
|
|
542
660
|
}
|
|
543
|
-
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
544
|
-
const ggml_tensor *src1, ggml_tensor *dst,
|
|
545
|
-
const float *src0_dd, const float *src1_dd,
|
|
546
|
-
float *dst_dd,
|
|
547
|
-
const queue_ptr &main_stream) {
|
|
548
|
-
|
|
549
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
550
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
551
661
|
|
|
552
|
-
|
|
662
|
+
inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
663
|
+
#if defined (GGML_SYCL_F16)
|
|
664
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
665
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
553
666
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
667
|
+
#else
|
|
668
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
669
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
670
|
+
#endif
|
|
671
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
672
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
673
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
674
|
+
switch (dst->type) {
|
|
675
|
+
#if defined (GGML_SYCL_F16)
|
|
676
|
+
case GGML_TYPE_F16:
|
|
677
|
+
{
|
|
678
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
679
|
+
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
680
|
+
break;
|
|
681
|
+
}
|
|
682
|
+
#endif
|
|
683
|
+
case GGML_TYPE_F32:
|
|
684
|
+
{
|
|
685
|
+
auto data_pts = cast_data<float>(dst);
|
|
686
|
+
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
687
|
+
break;
|
|
688
|
+
}
|
|
689
|
+
default:
|
|
690
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
691
|
+
break;
|
|
692
|
+
}
|
|
558
693
|
}
|
|
559
694
|
|
|
560
|
-
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
561
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
562
|
-
const float *src1_dd, float *dst_dd,
|
|
563
|
-
const queue_ptr &main_stream) {
|
|
564
695
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
696
|
+
inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
697
|
+
#if defined (GGML_SYCL_F16)
|
|
698
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
699
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
568
700
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
701
|
+
#else
|
|
702
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
703
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
704
|
+
#endif
|
|
705
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
706
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
707
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
708
|
+
switch (dst->type) {
|
|
709
|
+
#if defined (GGML_SYCL_F16)
|
|
710
|
+
case GGML_TYPE_F16:
|
|
711
|
+
{
|
|
712
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
713
|
+
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
714
|
+
break;
|
|
715
|
+
}
|
|
716
|
+
#endif
|
|
717
|
+
case GGML_TYPE_F32:
|
|
718
|
+
{
|
|
719
|
+
auto data_pts = cast_data<float>(dst);
|
|
720
|
+
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
721
|
+
break;
|
|
722
|
+
}
|
|
723
|
+
default:
|
|
724
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
725
|
+
break;
|
|
726
|
+
}
|
|
573
727
|
}
|
|
574
728
|
|
|
575
|
-
inline void
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
GGML_ASSERT(
|
|
581
|
-
GGML_ASSERT(
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
729
|
+
inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
730
|
+
#if defined (GGML_SYCL_F16)
|
|
731
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
732
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
733
|
+
#else
|
|
734
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
735
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
736
|
+
#endif
|
|
737
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
738
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
739
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
740
|
+
switch (dst->type) {
|
|
741
|
+
#if defined (GGML_SYCL_F16)
|
|
742
|
+
case GGML_TYPE_F16:
|
|
743
|
+
{
|
|
744
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
745
|
+
silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
746
|
+
break;
|
|
747
|
+
}
|
|
748
|
+
#endif
|
|
749
|
+
case GGML_TYPE_F32:
|
|
750
|
+
{
|
|
751
|
+
auto data_pts = cast_data<float>(dst);
|
|
752
|
+
silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
753
|
+
break;
|
|
754
|
+
}
|
|
755
|
+
default:
|
|
756
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
757
|
+
break;
|
|
758
|
+
}
|
|
589
759
|
}
|
|
590
760
|
|
|
591
|
-
inline void
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
GGML_ASSERT(
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
761
|
+
inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
762
|
+
#if defined (GGML_SYCL_F16)
|
|
763
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
764
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
765
|
+
#else
|
|
766
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
767
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
768
|
+
#endif
|
|
769
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
770
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
771
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
772
|
+
switch (dst->type) {
|
|
773
|
+
#if defined (GGML_SYCL_F16)
|
|
774
|
+
case GGML_TYPE_F16:
|
|
775
|
+
{
|
|
776
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
777
|
+
gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
778
|
+
break;
|
|
779
|
+
}
|
|
780
|
+
#endif
|
|
781
|
+
case GGML_TYPE_F32:
|
|
782
|
+
{
|
|
783
|
+
auto data_pts = cast_data<float>(dst);
|
|
784
|
+
gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
785
|
+
break;
|
|
786
|
+
}
|
|
787
|
+
default:
|
|
788
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
789
|
+
break;
|
|
790
|
+
}
|
|
606
791
|
}
|
|
607
792
|
|
|
608
|
-
inline void
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
GGML_ASSERT(
|
|
614
|
-
GGML_ASSERT(
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
793
|
+
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
794
|
+
#if defined (GGML_SYCL_F16)
|
|
795
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
796
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
797
|
+
#else
|
|
798
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
799
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
800
|
+
#endif
|
|
801
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
802
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
803
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
804
|
+
switch (dst->type) {
|
|
805
|
+
#if defined (GGML_SYCL_F16)
|
|
806
|
+
case GGML_TYPE_F16:
|
|
807
|
+
{
|
|
808
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
809
|
+
gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
810
|
+
break;
|
|
811
|
+
}
|
|
812
|
+
#endif
|
|
813
|
+
case GGML_TYPE_F32:
|
|
814
|
+
{
|
|
815
|
+
auto data_pts = cast_data<float>(dst);
|
|
816
|
+
gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
817
|
+
break;
|
|
818
|
+
}
|
|
819
|
+
default:
|
|
820
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
821
|
+
break;
|
|
822
|
+
}
|
|
622
823
|
}
|
|
623
824
|
|
|
624
|
-
inline void
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
GGML_ASSERT(
|
|
630
|
-
GGML_ASSERT(
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
825
|
+
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
826
|
+
#if defined (GGML_SYCL_F16)
|
|
827
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
828
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
829
|
+
#else
|
|
830
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
831
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
832
|
+
#endif
|
|
833
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
834
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
835
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
836
|
+
switch (dst->type) {
|
|
837
|
+
#if defined (GGML_SYCL_F16)
|
|
838
|
+
case GGML_TYPE_F16:
|
|
839
|
+
{
|
|
840
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
841
|
+
tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
842
|
+
break;
|
|
843
|
+
}
|
|
844
|
+
#endif
|
|
845
|
+
case GGML_TYPE_F32:
|
|
846
|
+
{
|
|
847
|
+
auto data_pts = cast_data<float>(dst);
|
|
848
|
+
tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
849
|
+
break;
|
|
850
|
+
}
|
|
851
|
+
default:
|
|
852
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
853
|
+
break;
|
|
854
|
+
}
|
|
638
855
|
}
|
|
639
856
|
|
|
640
|
-
inline void
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
GGML_ASSERT(
|
|
646
|
-
GGML_ASSERT(
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
857
|
+
inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
858
|
+
#if defined (GGML_SYCL_F16)
|
|
859
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
860
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
861
|
+
#else
|
|
862
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
863
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
864
|
+
#endif
|
|
865
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
866
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
867
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
868
|
+
|
|
869
|
+
switch (dst->type) {
|
|
870
|
+
#if defined (GGML_SYCL_F16)
|
|
871
|
+
case GGML_TYPE_F16:
|
|
872
|
+
{
|
|
873
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
874
|
+
relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
875
|
+
break;
|
|
876
|
+
}
|
|
877
|
+
#endif
|
|
878
|
+
case GGML_TYPE_F32:
|
|
879
|
+
{
|
|
880
|
+
auto data_pts = cast_data<float>(dst);
|
|
881
|
+
relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
882
|
+
break;
|
|
883
|
+
}
|
|
884
|
+
default:
|
|
885
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
886
|
+
break;
|
|
887
|
+
}
|
|
654
888
|
}
|
|
655
889
|
|
|
656
|
-
inline void
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
GGML_ASSERT(
|
|
662
|
-
GGML_ASSERT(
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
890
|
+
inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
891
|
+
#if defined (GGML_SYCL_F16)
|
|
892
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
893
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
894
|
+
#else
|
|
895
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
896
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
897
|
+
#endif
|
|
898
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
899
|
+
|
|
900
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
901
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
902
|
+
|
|
903
|
+
switch (dst->type) {
|
|
904
|
+
#if defined (GGML_SYCL_F16)
|
|
905
|
+
case GGML_TYPE_F16:
|
|
906
|
+
{
|
|
907
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
908
|
+
hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
909
|
+
break;
|
|
910
|
+
}
|
|
911
|
+
#endif
|
|
912
|
+
case GGML_TYPE_F32:
|
|
913
|
+
{
|
|
914
|
+
auto data_pts = cast_data<float>(dst);
|
|
915
|
+
hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
916
|
+
break;
|
|
917
|
+
}
|
|
918
|
+
default:
|
|
919
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
920
|
+
break;
|
|
921
|
+
}
|
|
670
922
|
}
|
|
671
923
|
|
|
672
|
-
inline void
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
GGML_ASSERT(
|
|
678
|
-
GGML_ASSERT(
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
924
|
+
inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
925
|
+
#if defined (GGML_SYCL_F16)
|
|
926
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
927
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
928
|
+
#else
|
|
929
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
930
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
931
|
+
#endif
|
|
932
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
933
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
934
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
935
|
+
switch (dst->type) {
|
|
936
|
+
#if defined (GGML_SYCL_F16)
|
|
937
|
+
case GGML_TYPE_F16:
|
|
938
|
+
{
|
|
939
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
940
|
+
hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
941
|
+
break;
|
|
942
|
+
}
|
|
943
|
+
#endif
|
|
944
|
+
case GGML_TYPE_F32:
|
|
945
|
+
{
|
|
946
|
+
auto data_pts = cast_data<float>(dst);
|
|
947
|
+
hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
948
|
+
break;
|
|
949
|
+
}
|
|
950
|
+
default:
|
|
951
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
952
|
+
break;
|
|
953
|
+
}
|
|
686
954
|
}
|
|
687
955
|
|
|
688
|
-
inline void
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
GGML_ASSERT(
|
|
694
|
-
GGML_ASSERT(
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
956
|
+
inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
957
|
+
#if defined (GGML_SYCL_F16)
|
|
958
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
959
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
960
|
+
#else
|
|
961
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
962
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
963
|
+
#endif
|
|
964
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
965
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
966
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
967
|
+
switch (dst->type) {
|
|
968
|
+
#if defined (GGML_SYCL_F16)
|
|
969
|
+
case GGML_TYPE_F16:
|
|
970
|
+
{
|
|
971
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
972
|
+
exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
973
|
+
break;
|
|
974
|
+
}
|
|
975
|
+
#endif
|
|
976
|
+
case GGML_TYPE_F32:
|
|
977
|
+
{
|
|
978
|
+
auto data_pts = cast_data<float>(dst);
|
|
979
|
+
exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
980
|
+
break;
|
|
981
|
+
}
|
|
982
|
+
default:
|
|
983
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
984
|
+
break;
|
|
985
|
+
}
|
|
702
986
|
}
|
|
703
987
|
|
|
704
|
-
inline void
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
GGML_ASSERT(
|
|
710
|
-
GGML_ASSERT(
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
988
|
+
inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
989
|
+
#if defined (GGML_SYCL_F16)
|
|
990
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
991
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
992
|
+
#else
|
|
993
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
994
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
995
|
+
#endif
|
|
996
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
997
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
998
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
999
|
+
switch (dst->type) {
|
|
1000
|
+
#if defined (GGML_SYCL_F16)
|
|
1001
|
+
case GGML_TYPE_F16:
|
|
1002
|
+
{
|
|
1003
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1004
|
+
log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1005
|
+
break;
|
|
1006
|
+
}
|
|
1007
|
+
#endif
|
|
1008
|
+
case GGML_TYPE_F32:
|
|
1009
|
+
{
|
|
1010
|
+
auto data_pts = cast_data<float>(dst);
|
|
1011
|
+
log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1012
|
+
break;
|
|
1013
|
+
}
|
|
1014
|
+
default:
|
|
1015
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1016
|
+
break;
|
|
1017
|
+
}
|
|
718
1018
|
}
|
|
719
1019
|
|
|
720
|
-
inline void
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
GGML_ASSERT(
|
|
726
|
-
GGML_ASSERT(
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
1020
|
+
inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1021
|
+
#if defined (GGML_SYCL_F16)
|
|
1022
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1023
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1024
|
+
#else
|
|
1025
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1026
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1027
|
+
#endif
|
|
1028
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1029
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1030
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1031
|
+
switch (dst->type) {
|
|
1032
|
+
#if defined (GGML_SYCL_F16)
|
|
1033
|
+
case GGML_TYPE_F16:
|
|
1034
|
+
{
|
|
1035
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1036
|
+
sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1037
|
+
break;
|
|
1038
|
+
}
|
|
1039
|
+
#endif
|
|
1040
|
+
case GGML_TYPE_F32:
|
|
1041
|
+
{
|
|
1042
|
+
auto data_pts = cast_data<float>(dst);
|
|
1043
|
+
sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1044
|
+
break;
|
|
1045
|
+
}
|
|
1046
|
+
default:
|
|
1047
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1048
|
+
break;
|
|
1049
|
+
}
|
|
734
1050
|
}
|
|
735
1051
|
|
|
736
|
-
inline void
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
1052
|
+
inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1053
|
+
#if defined (GGML_SYCL_F16)
|
|
1054
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1055
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1056
|
+
#else
|
|
1057
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1058
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1059
|
+
#endif
|
|
1060
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1061
|
+
|
|
1062
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1063
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1064
|
+
switch (dst->type) {
|
|
1065
|
+
#if defined (GGML_SYCL_F16)
|
|
1066
|
+
case GGML_TYPE_F16:
|
|
1067
|
+
{
|
|
1068
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1069
|
+
sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1070
|
+
break;
|
|
1071
|
+
}
|
|
1072
|
+
#endif
|
|
1073
|
+
case GGML_TYPE_F32:
|
|
1074
|
+
{
|
|
1075
|
+
auto data_pts = cast_data<float>(dst);
|
|
1076
|
+
sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1077
|
+
break;
|
|
1078
|
+
}
|
|
1079
|
+
default:
|
|
1080
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1081
|
+
break;
|
|
1082
|
+
}
|
|
1083
|
+
}
|
|
740
1084
|
|
|
741
|
-
|
|
742
|
-
|
|
1085
|
+
inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1086
|
+
#if defined (GGML_SYCL_F16)
|
|
1087
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1088
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1089
|
+
#else
|
|
1090
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1091
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1092
|
+
#endif
|
|
1093
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1094
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1095
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1096
|
+
switch (dst->type) {
|
|
1097
|
+
#if defined (GGML_SYCL_F16)
|
|
1098
|
+
case GGML_TYPE_F16:
|
|
1099
|
+
{
|
|
1100
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1101
|
+
sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1102
|
+
break;
|
|
1103
|
+
}
|
|
1104
|
+
#endif
|
|
1105
|
+
case GGML_TYPE_F32:
|
|
1106
|
+
{
|
|
1107
|
+
auto data_pts = cast_data<float>(dst);
|
|
1108
|
+
sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1109
|
+
break;
|
|
1110
|
+
}
|
|
1111
|
+
default:
|
|
1112
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1113
|
+
break;
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
743
1116
|
|
|
744
|
-
|
|
1117
|
+
inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1118
|
+
#if defined (GGML_SYCL_F16)
|
|
1119
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1120
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1121
|
+
#else
|
|
1122
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1123
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1124
|
+
#endif
|
|
1125
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1126
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1127
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1128
|
+
switch (dst->type) {
|
|
1129
|
+
#if defined (GGML_SYCL_F16)
|
|
1130
|
+
case GGML_TYPE_F16:
|
|
1131
|
+
{
|
|
1132
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1133
|
+
cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1134
|
+
break;
|
|
1135
|
+
}
|
|
1136
|
+
#endif
|
|
1137
|
+
case GGML_TYPE_F32:
|
|
1138
|
+
{
|
|
1139
|
+
auto data_pts = cast_data<float>(dst);
|
|
1140
|
+
cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1141
|
+
break;
|
|
1142
|
+
}
|
|
1143
|
+
default:
|
|
1144
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1145
|
+
break;
|
|
1146
|
+
}
|
|
1147
|
+
}
|
|
745
1148
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
1149
|
+
inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1150
|
+
#if defined (GGML_SYCL_F16)
|
|
1151
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1152
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1153
|
+
#else
|
|
1154
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1155
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1156
|
+
#endif
|
|
1157
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1158
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1159
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1160
|
+
switch (dst->type) {
|
|
1161
|
+
#if defined (GGML_SYCL_F16)
|
|
1162
|
+
case GGML_TYPE_F16:
|
|
1163
|
+
{
|
|
1164
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1165
|
+
step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1166
|
+
break;
|
|
1167
|
+
}
|
|
1168
|
+
#endif
|
|
1169
|
+
case GGML_TYPE_F32:
|
|
1170
|
+
{
|
|
1171
|
+
auto data_pts = cast_data<float>(dst);
|
|
1172
|
+
step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1173
|
+
break;
|
|
1174
|
+
}
|
|
1175
|
+
default:
|
|
1176
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1177
|
+
break;
|
|
1178
|
+
}
|
|
750
1179
|
}
|
|
751
1180
|
|
|
752
|
-
inline void
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
1181
|
+
inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1182
|
+
#if defined (GGML_SYCL_F16)
|
|
1183
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1184
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1185
|
+
#else
|
|
1186
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1187
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1188
|
+
#endif
|
|
1189
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1190
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1191
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1192
|
+
switch (dst->type) {
|
|
1193
|
+
#if defined (GGML_SYCL_F16)
|
|
1194
|
+
case GGML_TYPE_F16:
|
|
1195
|
+
{
|
|
1196
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1197
|
+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1198
|
+
break;
|
|
1199
|
+
}
|
|
1200
|
+
#endif
|
|
1201
|
+
case GGML_TYPE_F32:
|
|
1202
|
+
{
|
|
1203
|
+
auto data_pts = cast_data<float>(dst);
|
|
1204
|
+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1205
|
+
break;
|
|
1206
|
+
}
|
|
1207
|
+
default:
|
|
1208
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1209
|
+
break;
|
|
1210
|
+
}
|
|
1211
|
+
}
|
|
757
1212
|
|
|
758
|
-
|
|
759
|
-
|
|
1213
|
+
inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1214
|
+
#if defined (GGML_SYCL_F16)
|
|
1215
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1216
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1217
|
+
#else
|
|
1218
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1219
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1220
|
+
#endif
|
|
760
1221
|
|
|
1222
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
761
1223
|
float negative_slope;
|
|
762
1224
|
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
1225
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1226
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1227
|
+
switch (dst->type) {
|
|
1228
|
+
#if defined (GGML_SYCL_F16)
|
|
1229
|
+
case GGML_TYPE_F16:
|
|
1230
|
+
{
|
|
1231
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1232
|
+
leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
|
|
1233
|
+
break;
|
|
1234
|
+
}
|
|
1235
|
+
#endif
|
|
1236
|
+
case GGML_TYPE_F32:
|
|
1237
|
+
{
|
|
1238
|
+
auto data_pts = cast_data<float>(dst);
|
|
1239
|
+
leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
|
|
1240
|
+
break;
|
|
1241
|
+
}
|
|
1242
|
+
default:
|
|
1243
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1244
|
+
break;
|
|
1245
|
+
}
|
|
770
1246
|
}
|
|
771
1247
|
|
|
772
|
-
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx,
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
GGML_ASSERT(
|
|
778
|
-
GGML_ASSERT(
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
1248
|
+
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1249
|
+
#if defined (GGML_SYCL_F16)
|
|
1250
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1251
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1252
|
+
#else
|
|
1253
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1254
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1255
|
+
#endif
|
|
1256
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1257
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1258
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1259
|
+
switch (dst->type) {
|
|
1260
|
+
#if defined (GGML_SYCL_F16)
|
|
1261
|
+
case GGML_TYPE_F16:
|
|
1262
|
+
{
|
|
1263
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1264
|
+
sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1265
|
+
break;
|
|
1266
|
+
}
|
|
1267
|
+
#endif
|
|
1268
|
+
case GGML_TYPE_F32:
|
|
1269
|
+
{
|
|
1270
|
+
auto data_pts = cast_data<float>(dst);
|
|
1271
|
+
sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1272
|
+
break;
|
|
1273
|
+
}
|
|
1274
|
+
default:
|
|
1275
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1276
|
+
break;
|
|
1277
|
+
}
|
|
786
1278
|
}
|
|
787
1279
|
|
|
788
|
-
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx,
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
1280
|
+
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1281
|
+
#if defined (GGML_SYCL_F16)
|
|
1282
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1283
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1284
|
+
#else
|
|
1285
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
795
1286
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
1287
|
+
#endif
|
|
1288
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1289
|
+
|
|
1290
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1291
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1292
|
+
|
|
1293
|
+
const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
|
|
1294
|
+
const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
|
|
1295
|
+
const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
|
|
1296
|
+
const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
|
|
1297
|
+
switch (dst->type) {
|
|
1298
|
+
#if defined (GGML_SYCL_F16)
|
|
1299
|
+
case GGML_TYPE_F16:
|
|
1300
|
+
{
|
|
1301
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1302
|
+
upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
|
|
1303
|
+
dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
1304
|
+
main_stream);
|
|
1305
|
+
break;
|
|
1306
|
+
}
|
|
1307
|
+
#endif
|
|
1308
|
+
case GGML_TYPE_F32:
|
|
1309
|
+
{
|
|
1310
|
+
auto data_pts = cast_data<float>(dst);
|
|
1311
|
+
upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
|
|
1312
|
+
dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
1313
|
+
main_stream);
|
|
1314
|
+
break;
|
|
1315
|
+
}
|
|
1316
|
+
default:
|
|
1317
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1318
|
+
break;
|
|
1319
|
+
}
|
|
810
1320
|
}
|
|
811
1321
|
|
|
812
|
-
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx,
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
GGML_ASSERT(
|
|
1322
|
+
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1323
|
+
#if defined (GGML_SYCL_F16)
|
|
1324
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1325
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1326
|
+
#else
|
|
1327
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
818
1328
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
819
|
-
|
|
1329
|
+
#endif
|
|
1330
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1331
|
+
GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
|
1332
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1333
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1334
|
+
switch (dst->type) {
|
|
1335
|
+
#if defined (GGML_SYCL_F16)
|
|
1336
|
+
case GGML_TYPE_F16:
|
|
1337
|
+
{
|
|
1338
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1339
|
+
pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
|
|
1340
|
+
dst->ne[1], dst->ne[2], main_stream);
|
|
1341
|
+
break;
|
|
1342
|
+
}
|
|
1343
|
+
#endif
|
|
1344
|
+
case GGML_TYPE_F32:
|
|
1345
|
+
{
|
|
1346
|
+
auto data_pts = cast_data<float>(dst);
|
|
1347
|
+
pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
|
|
1348
|
+
dst->ne[1], dst->ne[2], main_stream);
|
|
1349
|
+
break;
|
|
1350
|
+
}
|
|
1351
|
+
default:
|
|
1352
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1353
|
+
break;
|
|
1354
|
+
}
|
|
1355
|
+
}
|
|
820
1356
|
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
1357
|
+
inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1358
|
+
#if defined(GGML_SYCL_F16)
|
|
1359
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1360
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1361
|
+
#else
|
|
824
1362
|
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
1363
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1364
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1365
|
+
#endif
|
|
1366
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1367
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1368
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1369
|
+
float min;
|
|
1370
|
+
float max;
|
|
1371
|
+
memcpy(&min, dst->op_params, sizeof(float));
|
|
1372
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
|
1373
|
+
|
|
1374
|
+
switch (dst->type) {
|
|
1375
|
+
#if defined(GGML_SYCL_F16)
|
|
1376
|
+
case GGML_TYPE_F16:
|
|
1377
|
+
{
|
|
1378
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1379
|
+
clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
|
|
1380
|
+
break;
|
|
1381
|
+
}
|
|
1382
|
+
#endif
|
|
1383
|
+
case GGML_TYPE_F32:
|
|
1384
|
+
{
|
|
1385
|
+
auto data_pts = cast_data<float>(dst);
|
|
1386
|
+
clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
|
|
1387
|
+
break;
|
|
1388
|
+
}
|
|
1389
|
+
default:
|
|
1390
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1391
|
+
break;
|
|
1392
|
+
}
|
|
829
1393
|
}
|
|
830
1394
|
|
|
831
|
-
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx,
|
|
832
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
833
|
-
const float *src1_dd, float *dst_dd,
|
|
834
|
-
const queue_ptr &main_stream) {
|
|
1395
|
+
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
835
1396
|
|
|
836
|
-
GGML_ASSERT(
|
|
837
|
-
GGML_ASSERT(
|
|
1397
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1398
|
+
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
|
|
838
1399
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
839
1400
|
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
|
1401
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1402
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1403
|
+
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
|
|
1404
|
+
const float * src1_dd = static_cast<const float*>(dst->src[1]->data);
|
|
1405
|
+
float * dst_dd = static_cast<float *>(dst->data);
|
|
840
1406
|
|
|
841
1407
|
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
|
842
1408
|
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
|
843
1409
|
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
|
844
1410
|
int offset = dst->op_params[3] / 4; // offset in bytes
|
|
845
1411
|
|
|
846
|
-
acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst),
|
|
847
|
-
|
|
848
|
-
GGML_UNUSED(dst);
|
|
849
|
-
GGML_UNUSED(ctx);
|
|
850
|
-
}
|
|
851
|
-
|
|
852
|
-
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
853
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
854
|
-
const float *src1_dd, float *dst_dd,
|
|
855
|
-
const queue_ptr &main_stream) {
|
|
856
|
-
|
|
857
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
861
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
862
|
-
const float *src1_dd, float *dst_dd,
|
|
863
|
-
const queue_ptr &main_stream) {
|
|
864
|
-
|
|
865
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
869
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
870
|
-
const float *src1_dd, float *dst_dd,
|
|
871
|
-
const queue_ptr &main_stream) {
|
|
872
|
-
|
|
873
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
877
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
878
|
-
const float *src1_dd, float *dst_dd,
|
|
879
|
-
const queue_ptr &main_stream) {
|
|
880
|
-
|
|
881
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
1412
|
+
acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
|
|
882
1413
|
}
|
|
883
1414
|
|
|
884
1415
|
|
|
885
1416
|
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
886
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
887
|
-
|
|
1417
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1418
|
+
ggml_sycl_op_sqrt(ctx, dst);
|
|
888
1419
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
889
1420
|
}
|
|
890
1421
|
|
|
891
1422
|
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
892
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
893
|
-
|
|
1423
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1424
|
+
ggml_sycl_op_sin(ctx, dst);
|
|
894
1425
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
895
1426
|
}
|
|
896
1427
|
|
|
897
1428
|
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
898
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
899
|
-
|
|
1429
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1430
|
+
ggml_sycl_op_cos(ctx, dst);
|
|
900
1431
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
901
1432
|
}
|
|
902
1433
|
|
|
903
1434
|
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
904
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
905
|
-
|
|
1435
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1436
|
+
ggml_sycl_op_acc(ctx, dst);
|
|
906
1437
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
907
1438
|
}
|
|
908
1439
|
|
|
909
1440
|
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
910
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
911
|
-
|
|
1441
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1442
|
+
ggml_sycl_op_gelu(ctx, dst);
|
|
912
1443
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
913
1444
|
}
|
|
914
1445
|
|
|
915
1446
|
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
916
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
917
|
-
|
|
1447
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1448
|
+
ggml_sycl_op_silu(ctx, dst);
|
|
918
1449
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
919
1450
|
}
|
|
920
1451
|
|
|
921
1452
|
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
922
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
923
|
-
|
|
1453
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1454
|
+
ggml_sycl_op_gelu_quick(ctx, dst);
|
|
924
1455
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
925
1456
|
}
|
|
926
1457
|
|
|
927
1458
|
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
928
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
929
|
-
|
|
1459
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1460
|
+
ggml_sycl_op_tanh(ctx, dst);
|
|
930
1461
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
931
1462
|
}
|
|
932
1463
|
|
|
933
1464
|
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
934
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
935
|
-
|
|
1465
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1466
|
+
ggml_sycl_op_relu(ctx, dst);
|
|
936
1467
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
937
1468
|
}
|
|
938
1469
|
|
|
939
1470
|
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
940
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
941
|
-
|
|
1471
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1472
|
+
ggml_sycl_op_sigmoid(ctx, dst);
|
|
942
1473
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
943
1474
|
}
|
|
944
1475
|
|
|
945
1476
|
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
946
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
947
|
-
|
|
1477
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1478
|
+
ggml_sycl_op_hardsigmoid(ctx, dst);
|
|
948
1479
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
949
1480
|
}
|
|
950
1481
|
|
|
951
1482
|
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
952
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
953
|
-
|
|
1483
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1484
|
+
ggml_sycl_op_hardswish(ctx, dst);
|
|
954
1485
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
955
1486
|
}
|
|
956
1487
|
|
|
957
1488
|
|
|
958
1489
|
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
959
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
960
|
-
|
|
1490
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1491
|
+
ggml_sycl_op_exp(ctx, dst);
|
|
961
1492
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
962
1493
|
}
|
|
963
1494
|
|
|
964
1495
|
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
965
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
966
|
-
|
|
1496
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1497
|
+
ggml_sycl_op_log(ctx, dst);
|
|
967
1498
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
968
1499
|
}
|
|
969
1500
|
|
|
970
1501
|
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
971
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
972
|
-
|
|
1502
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1503
|
+
ggml_sycl_op_neg(ctx, dst);
|
|
973
1504
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
974
1505
|
}
|
|
975
1506
|
|
|
976
1507
|
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
977
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
978
|
-
|
|
1508
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1509
|
+
ggml_sycl_op_step(ctx, dst);
|
|
979
1510
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
980
1511
|
}
|
|
981
1512
|
|
|
982
1513
|
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
983
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
984
|
-
|
|
1514
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1515
|
+
ggml_sycl_op_leaky_relu(ctx, dst);
|
|
985
1516
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
986
1517
|
}
|
|
987
1518
|
|
|
988
1519
|
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
989
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
990
|
-
|
|
1520
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1521
|
+
ggml_sycl_op_sqr(ctx, dst);
|
|
991
1522
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
992
1523
|
}
|
|
993
1524
|
|
|
994
1525
|
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
995
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
996
|
-
|
|
1526
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1527
|
+
ggml_sycl_op_upscale(ctx, dst);
|
|
997
1528
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
998
1529
|
}
|
|
999
1530
|
|
|
1000
1531
|
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1001
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1002
|
-
|
|
1532
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1533
|
+
ggml_sycl_op_pad(ctx, dst);
|
|
1003
1534
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1004
1535
|
}
|
|
1005
1536
|
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1010
|
-
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
|
|
1537
|
+
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1538
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1539
|
+
ggml_sycl_op_clamp(ctx, dst);
|
|
1011
1540
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1012
1541
|
}
|
|
1013
1542
|
|
|
1014
|
-
void
|
|
1015
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1016
|
-
|
|
1543
|
+
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1544
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1545
|
+
ggml_sycl_op_sgn(ctx, dst);
|
|
1017
1546
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1018
1547
|
}
|
|
1019
1548
|
|
|
1020
|
-
void
|
|
1021
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1022
|
-
|
|
1549
|
+
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1550
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1551
|
+
ggml_sycl_op_abs(ctx, dst);
|
|
1023
1552
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1024
1553
|
}
|
|
1025
1554
|
|
|
1026
|
-
void
|
|
1027
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1028
|
-
|
|
1555
|
+
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1556
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1557
|
+
ggml_sycl_op_elu(ctx, dst);
|
|
1029
1558
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1030
1559
|
}
|