@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include "common.hpp"
|
|
2
|
+
#include "ggml.h"
|
|
2
3
|
#include "element_wise.hpp"
|
|
3
4
|
|
|
4
5
|
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
@@ -20,10 +21,32 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
|
|
|
20
21
|
}
|
|
21
22
|
}
|
|
22
23
|
|
|
23
|
-
|
|
24
|
+
template<typename T>
|
|
25
|
+
static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
26
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
27
|
+
dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
template<typename T>
|
|
32
|
+
static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
33
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
34
|
+
dst[i] = sycl::fabs(x[i]);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
template<typename T>
|
|
39
|
+
static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) {
|
|
40
|
+
for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) {
|
|
41
|
+
dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
template<typename T>
|
|
46
|
+
static void gelu(const T * x, T * dst, const int k,
|
|
24
47
|
const sycl::nd_item<3> &item_ct1) {
|
|
25
|
-
const
|
|
26
|
-
const
|
|
48
|
+
const T GELU_COEF_A = static_cast<T>(0.044715f);
|
|
49
|
+
const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
|
|
27
50
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
28
51
|
item_ct1.get_local_id(2);
|
|
29
52
|
|
|
@@ -32,12 +55,13 @@ static void gelu_f32(const float * x, float * dst, const int k,
|
|
|
32
55
|
}
|
|
33
56
|
|
|
34
57
|
float xi = x[i];
|
|
35
|
-
dst[i] = 0.5f * xi *
|
|
36
|
-
(1.0f +
|
|
37
|
-
sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
|
|
58
|
+
dst[i] = static_cast<T>(0.5f) * xi *
|
|
59
|
+
(static_cast<T>(1.0f) +
|
|
60
|
+
sycl::tanh(SQRT_2_OVER_PI * xi * (static_cast<T>(1.0f) + GELU_COEF_A * xi * xi)));
|
|
38
61
|
}
|
|
39
62
|
|
|
40
|
-
|
|
63
|
+
template<typename T>
|
|
64
|
+
static void silu(const T * x, T * dst, const int k,
|
|
41
65
|
const sycl::nd_item<3> &item_ct1) {
|
|
42
66
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
43
67
|
item_ct1.get_local_id(2);
|
|
@@ -45,10 +69,11 @@ static void silu_f32(const float * x, float * dst, const int k,
|
|
|
45
69
|
if (i >= k) {
|
|
46
70
|
return;
|
|
47
71
|
}
|
|
48
|
-
dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
|
|
72
|
+
dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
|
|
49
73
|
}
|
|
50
74
|
|
|
51
|
-
|
|
75
|
+
template<typename T>
|
|
76
|
+
static void gelu_quick(const T *x, T *dst, int k,
|
|
52
77
|
const sycl::nd_item<3> &item_ct1) {
|
|
53
78
|
const float GELU_QUICK_COEF = -1.702f;
|
|
54
79
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
@@ -56,20 +81,22 @@ static void gelu_quick_f32(const float *x, float *dst, int k,
|
|
|
56
81
|
if (i >= k) {
|
|
57
82
|
return;
|
|
58
83
|
}
|
|
59
|
-
dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
|
84
|
+
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF * x[i])));
|
|
60
85
|
}
|
|
61
86
|
|
|
62
|
-
|
|
87
|
+
template<typename T>
|
|
88
|
+
static void tanh(const T *x, T *dst, int k,
|
|
63
89
|
const sycl::nd_item<3> &item_ct1) {
|
|
64
90
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
65
91
|
item_ct1.get_local_id(2);
|
|
66
92
|
if (i >= k) {
|
|
67
93
|
return;
|
|
68
94
|
}
|
|
69
|
-
dst[i] = sycl::tanh((
|
|
95
|
+
dst[i] = sycl::tanh((x[i]));
|
|
70
96
|
}
|
|
71
97
|
|
|
72
|
-
|
|
98
|
+
template<typename T>
|
|
99
|
+
static void relu(const T * x, T * dst, const int k,
|
|
73
100
|
const sycl::nd_item<3> &item_ct1) {
|
|
74
101
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
75
102
|
item_ct1.get_local_id(2);
|
|
@@ -77,10 +104,11 @@ static void relu_f32(const float * x, float * dst, const int k,
|
|
|
77
104
|
if (i >= k) {
|
|
78
105
|
return;
|
|
79
106
|
}
|
|
80
|
-
dst[i] = sycl::fmax((
|
|
107
|
+
dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
|
|
81
108
|
}
|
|
82
109
|
|
|
83
|
-
|
|
110
|
+
template<typename T>
|
|
111
|
+
static void sigmoid(const T * x, T * dst, const int k,
|
|
84
112
|
const sycl::nd_item<3> &item_ct1) {
|
|
85
113
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
86
114
|
item_ct1.get_local_id(2);
|
|
@@ -88,10 +116,11 @@ static void sigmoid_f32(const float * x, float * dst, const int k,
|
|
|
88
116
|
if (i >= k) {
|
|
89
117
|
return;
|
|
90
118
|
}
|
|
91
|
-
dst[i] = 1.0f / (1.0f + sycl::native::exp(-x[i]));
|
|
119
|
+
dst[i] = 1.0f / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
|
|
92
120
|
}
|
|
93
121
|
|
|
94
|
-
|
|
122
|
+
template<typename T>
|
|
123
|
+
static void sqrt(const T * x, T * dst, const int k,
|
|
95
124
|
const sycl::nd_item<3> &item_ct1) {
|
|
96
125
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
97
126
|
item_ct1.get_local_id(2);
|
|
@@ -102,7 +131,8 @@ static void sqrt_f32(const float * x, float * dst, const int k,
|
|
|
102
131
|
dst[i] = sycl::sqrt(x[i]);
|
|
103
132
|
}
|
|
104
133
|
|
|
105
|
-
|
|
134
|
+
template<typename T>
|
|
135
|
+
static void sin(const T * x, T * dst, const int k,
|
|
106
136
|
const sycl::nd_item<3> &item_ct1) {
|
|
107
137
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
108
138
|
item_ct1.get_local_id(2);
|
|
@@ -113,7 +143,8 @@ static void sin_f32(const float * x, float * dst, const int k,
|
|
|
113
143
|
dst[i] = sycl::sin(x[i]);
|
|
114
144
|
}
|
|
115
145
|
|
|
116
|
-
|
|
146
|
+
template<typename T>
|
|
147
|
+
static void cos(const T * x, T * dst, const int k,
|
|
117
148
|
const sycl::nd_item<3> &item_ct1) {
|
|
118
149
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
119
150
|
item_ct1.get_local_id(2);
|
|
@@ -124,7 +155,8 @@ static void cos_f32(const float * x, float * dst, const int k,
|
|
|
124
155
|
dst[i] = sycl::cos(x[i]);
|
|
125
156
|
}
|
|
126
157
|
|
|
127
|
-
|
|
158
|
+
template<typename T>
|
|
159
|
+
static void hardsigmoid(const T * x, T * dst, const int k,
|
|
128
160
|
const sycl::nd_item<3> &item_ct1) {
|
|
129
161
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
130
162
|
item_ct1.get_local_id(2);
|
|
@@ -132,10 +164,11 @@ static void hardsigmoid_f32(const float * x, float * dst, const int k,
|
|
|
132
164
|
if (i >= k) {
|
|
133
165
|
return;
|
|
134
166
|
}
|
|
135
|
-
dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
167
|
+
dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
|
|
136
168
|
}
|
|
137
169
|
|
|
138
|
-
|
|
170
|
+
template<typename T>
|
|
171
|
+
static void hardswish(const T * x, T * dst, const int k,
|
|
139
172
|
const sycl::nd_item<3> &item_ct1) {
|
|
140
173
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
141
174
|
item_ct1.get_local_id(2);
|
|
@@ -143,10 +176,11 @@ static void hardswish_f32(const float * x, float * dst, const int k,
|
|
|
143
176
|
if (i >= k) {
|
|
144
177
|
return;
|
|
145
178
|
}
|
|
146
|
-
dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f));
|
|
179
|
+
dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
|
|
147
180
|
}
|
|
148
181
|
|
|
149
|
-
|
|
182
|
+
template<typename T>
|
|
183
|
+
static void exp(const T * x, T * dst, const int k,
|
|
150
184
|
const sycl::nd_item<3> &item_ct1) {
|
|
151
185
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
152
186
|
item_ct1.get_local_id(2);
|
|
@@ -157,7 +191,8 @@ static void exp_f32(const float * x, float * dst, const int k,
|
|
|
157
191
|
dst[i] = sycl::exp(x[i]);
|
|
158
192
|
}
|
|
159
193
|
|
|
160
|
-
|
|
194
|
+
template<typename T>
|
|
195
|
+
static void log(const T * x, T * dst, const int k,
|
|
161
196
|
const sycl::nd_item<3> &item_ct1) {
|
|
162
197
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
163
198
|
item_ct1.get_local_id(2);
|
|
@@ -165,15 +200,16 @@ static void log_f32(const float * x, float * dst, const int k,
|
|
|
165
200
|
if (i >= k) {
|
|
166
201
|
return;
|
|
167
202
|
}
|
|
168
|
-
|
|
203
|
+
T xi = x[i];
|
|
169
204
|
if (xi <= 0) {
|
|
170
|
-
dst[i] =
|
|
205
|
+
dst[i] = neg_infinity<T>();
|
|
171
206
|
} else {
|
|
172
207
|
dst[i] = sycl::log(xi);
|
|
173
208
|
}
|
|
174
209
|
}
|
|
175
210
|
|
|
176
|
-
|
|
211
|
+
template<typename T>
|
|
212
|
+
static void neg(const T * x, T * dst, const int k,
|
|
177
213
|
const sycl::nd_item<3> &item_ct1) {
|
|
178
214
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
179
215
|
item_ct1.get_local_id(2);
|
|
@@ -184,7 +220,8 @@ static void neg_f32(const float * x, float * dst, const int k,
|
|
|
184
220
|
dst[i] = -x[i];
|
|
185
221
|
}
|
|
186
222
|
|
|
187
|
-
|
|
223
|
+
template<typename T>
|
|
224
|
+
static void step(const T * x, T * dst, const int k,
|
|
188
225
|
const sycl::nd_item<3> &item_ct1) {
|
|
189
226
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
190
227
|
item_ct1.get_local_id(2);
|
|
@@ -192,21 +229,23 @@ static void step_f32(const float * x, float * dst, const int k,
|
|
|
192
229
|
if (i >= k) {
|
|
193
230
|
return;
|
|
194
231
|
}
|
|
195
|
-
dst[i] = x[i] > 0.0f;
|
|
232
|
+
dst[i] = x[i] > static_cast<T>(0.0f);
|
|
196
233
|
}
|
|
197
234
|
|
|
198
|
-
|
|
235
|
+
template<typename T>
|
|
236
|
+
static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
|
|
199
237
|
const sycl::nd_item<3> &item_ct1) {
|
|
200
238
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
201
239
|
item_ct1.get_local_id(2);
|
|
202
240
|
if (i >= k) {
|
|
203
241
|
return;
|
|
204
242
|
}
|
|
205
|
-
dst[i] = sycl::fmax((
|
|
206
|
-
sycl::fmin((
|
|
243
|
+
dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
|
|
244
|
+
sycl::fmin((x[i]), static_cast<T>(0.0f)) * negative_slope;
|
|
207
245
|
}
|
|
208
246
|
|
|
209
|
-
|
|
247
|
+
template<typename T>
|
|
248
|
+
static void sqr(const T * x, T * dst, const int k,
|
|
210
249
|
const sycl::nd_item<3> &item_ct1) {
|
|
211
250
|
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
212
251
|
item_ct1.get_local_id(2);
|
|
@@ -217,7 +256,8 @@ static void sqr_f32(const float * x, float * dst, const int k,
|
|
|
217
256
|
dst[i] = x[i] * x[i];
|
|
218
257
|
}
|
|
219
258
|
|
|
220
|
-
|
|
259
|
+
template<typename T>
|
|
260
|
+
static void upscale(const T *x, T *dst, const int nb00, const int nb01,
|
|
221
261
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
222
262
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
223
263
|
const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) {
|
|
@@ -237,10 +277,11 @@ static void upscale_f32(const float *x, float *dst, const int nb00, const int n
|
|
|
237
277
|
int i02 = i12 / sf2;
|
|
238
278
|
int i03 = i13 / sf3;
|
|
239
279
|
|
|
240
|
-
dst[index] = *(const
|
|
280
|
+
dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
|
241
281
|
}
|
|
242
282
|
|
|
243
|
-
|
|
283
|
+
template <typename T>
|
|
284
|
+
static void pad(const T *x, T *dst, const int ne0, const int ne00, const int ne01, const int ne02,
|
|
244
285
|
const sycl::nd_item<3> &item_ct1) {
|
|
245
286
|
int nidx = item_ct1.get_local_id(2) +
|
|
246
287
|
item_ct1.get_group(2) * item_ct1.get_local_range(2);
|
|
@@ -256,11 +297,23 @@ static void pad_f32(const float *x, float *dst, const int ne0, const int ne00,
|
|
|
256
297
|
item_ct1.get_group(0) * ne00 * ne01;
|
|
257
298
|
dst[offset_dst] = x[offset_src];
|
|
258
299
|
} else {
|
|
259
|
-
dst[offset_dst] = 0.0f;
|
|
300
|
+
dst[offset_dst] = static_cast<T>(0.0f);
|
|
260
301
|
}
|
|
261
302
|
}
|
|
262
303
|
|
|
263
304
|
|
|
305
|
+
template<typename T>
|
|
306
|
+
static void clamp(const T * x, T * dst, const float min, const float max, const int k,
|
|
307
|
+
const sycl::nd_item<3> &item_ct1) {
|
|
308
|
+
const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
|
|
309
|
+
item_ct1.get_local_id(2);
|
|
310
|
+
|
|
311
|
+
if (i >= k) {
|
|
312
|
+
return;
|
|
313
|
+
}
|
|
314
|
+
|
|
315
|
+
dst[i] = x[i] < static_cast<T>(min) ? static_cast<T>(min) : (x[i] > static_cast<T>(max) ? static_cast<T>(max) : x[i]);
|
|
316
|
+
}
|
|
264
317
|
|
|
265
318
|
static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
266
319
|
const int n_elements, const int ne10, const int ne11,
|
|
@@ -277,7 +330,8 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
|
|
|
277
330
|
});
|
|
278
331
|
}
|
|
279
332
|
|
|
280
|
-
|
|
333
|
+
template<typename T>
|
|
334
|
+
static void gelu_sycl(const T *x, T *dst, const int k,
|
|
281
335
|
queue_ptr stream) {
|
|
282
336
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
283
337
|
stream->parallel_for(
|
|
@@ -285,11 +339,12 @@ static void gelu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
285
339
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
|
286
340
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
|
287
341
|
[=](sycl::nd_item<3> item_ct1) {
|
|
288
|
-
|
|
342
|
+
gelu(x, dst, k, item_ct1);
|
|
289
343
|
});
|
|
290
344
|
}
|
|
291
345
|
|
|
292
|
-
|
|
346
|
+
template<typename T>
|
|
347
|
+
static void silu_sycl(const T *x, T *dst, const int k,
|
|
293
348
|
queue_ptr stream) {
|
|
294
349
|
const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
|
|
295
350
|
stream->parallel_for(
|
|
@@ -297,11 +352,43 @@ static void silu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
297
352
|
sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
|
|
298
353
|
sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
|
|
299
354
|
[=](sycl::nd_item<3> item_ct1) {
|
|
300
|
-
|
|
355
|
+
silu(x, dst, k, item_ct1);
|
|
301
356
|
});
|
|
302
357
|
}
|
|
303
358
|
|
|
304
|
-
|
|
359
|
+
template<typename T>
|
|
360
|
+
static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
361
|
+
// hard code for now
|
|
362
|
+
const int num_blocks = ceil_div(k, 256);
|
|
363
|
+
stream->parallel_for(
|
|
364
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
365
|
+
sgn(x, dst, k, item_ct1);
|
|
366
|
+
});
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
template<typename T>
|
|
370
|
+
static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
371
|
+
// hard code for now
|
|
372
|
+
const int num_blocks = ceil_div(k, 256);
|
|
373
|
+
stream->parallel_for(
|
|
374
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
375
|
+
abs_op(x, dst, k, item_ct1);
|
|
376
|
+
});
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
template<typename T>
|
|
381
|
+
static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
|
|
382
|
+
// hard code for now
|
|
383
|
+
const int num_blocks = ceil_div(k, 256);
|
|
384
|
+
stream->parallel_for(
|
|
385
|
+
sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
|
|
386
|
+
elu_op(x, dst, k, item_ct1);
|
|
387
|
+
});
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
template<typename T>
|
|
391
|
+
static void gelu_quick_sycl(const T *x, T *dst, const int k,
|
|
305
392
|
queue_ptr stream) {
|
|
306
393
|
const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
|
|
307
394
|
stream->parallel_for(
|
|
@@ -309,11 +396,12 @@ static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
|
|
|
309
396
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
|
|
310
397
|
sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
|
|
311
398
|
[=](sycl::nd_item<3> item_ct1) {
|
|
312
|
-
|
|
399
|
+
gelu_quick(x, dst, k, item_ct1);
|
|
313
400
|
});
|
|
314
401
|
}
|
|
315
402
|
|
|
316
|
-
|
|
403
|
+
template<typename T>
|
|
404
|
+
static void tanh_sycl(const T *x, T *dst, const int k,
|
|
317
405
|
queue_ptr stream) {
|
|
318
406
|
const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
|
|
319
407
|
stream->parallel_for(
|
|
@@ -321,11 +409,12 @@ static void tanh_f32_sycl(const float *x, float *dst, const int k,
|
|
|
321
409
|
sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
|
|
322
410
|
sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
|
|
323
411
|
[=](sycl::nd_item<3> item_ct1) {
|
|
324
|
-
|
|
412
|
+
tanh(x, dst, k, item_ct1);
|
|
325
413
|
});
|
|
326
414
|
}
|
|
327
415
|
|
|
328
|
-
|
|
416
|
+
template<typename T>
|
|
417
|
+
static void relu_sycl(const T *x, T *dst, const int k,
|
|
329
418
|
queue_ptr stream) {
|
|
330
419
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
331
420
|
stream->parallel_for(
|
|
@@ -333,11 +422,12 @@ static void relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
333
422
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
|
|
334
423
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
|
|
335
424
|
[=](sycl::nd_item<3> item_ct1) {
|
|
336
|
-
|
|
425
|
+
relu(x, dst, k, item_ct1);
|
|
337
426
|
});
|
|
338
427
|
}
|
|
339
428
|
|
|
340
|
-
|
|
429
|
+
template<typename T>
|
|
430
|
+
static void hardsigmoid_sycl(const T *x, T *dst, const int k,
|
|
341
431
|
queue_ptr stream) {
|
|
342
432
|
const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
|
|
343
433
|
stream->parallel_for(
|
|
@@ -345,11 +435,12 @@ static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
345
435
|
sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
|
|
346
436
|
sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
|
|
347
437
|
[=](sycl::nd_item<3> item_ct1) {
|
|
348
|
-
|
|
438
|
+
hardsigmoid(x, dst, k, item_ct1);
|
|
349
439
|
});
|
|
350
440
|
}
|
|
351
441
|
|
|
352
|
-
|
|
442
|
+
template<typename T>
|
|
443
|
+
static void hardswish_sycl(const T *x, T *dst, const int k,
|
|
353
444
|
queue_ptr stream) {
|
|
354
445
|
const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
|
|
355
446
|
stream->parallel_for(
|
|
@@ -357,11 +448,12 @@ static void hardswish_f32_sycl(const float *x, float *dst, const int k,
|
|
|
357
448
|
sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
|
|
358
449
|
sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
|
|
359
450
|
[=](sycl::nd_item<3> item_ct1) {
|
|
360
|
-
|
|
451
|
+
hardswish(x, dst, k, item_ct1);
|
|
361
452
|
});
|
|
362
453
|
}
|
|
363
454
|
|
|
364
|
-
|
|
455
|
+
template<typename T>
|
|
456
|
+
static void exp_sycl(const T *x, T *dst, const int k,
|
|
365
457
|
queue_ptr stream) {
|
|
366
458
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
367
459
|
stream->parallel_for(
|
|
@@ -369,11 +461,12 @@ static void exp_f32_sycl(const float *x, float *dst, const int k,
|
|
|
369
461
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
|
|
370
462
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
|
|
371
463
|
[=](sycl::nd_item<3> item_ct1) {
|
|
372
|
-
|
|
464
|
+
exp(x, dst, k, item_ct1);
|
|
373
465
|
});
|
|
374
466
|
}
|
|
375
467
|
|
|
376
|
-
|
|
468
|
+
template<typename T>
|
|
469
|
+
static void log_sycl(const T *x, T *dst, const int k,
|
|
377
470
|
queue_ptr stream) {
|
|
378
471
|
const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
|
|
379
472
|
stream->parallel_for(
|
|
@@ -381,11 +474,12 @@ static void log_f32_sycl(const float *x, float *dst, const int k,
|
|
|
381
474
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
|
|
382
475
|
sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
|
|
383
476
|
[=](sycl::nd_item<3> item_ct1) {
|
|
384
|
-
|
|
477
|
+
log(x, dst, k, item_ct1);
|
|
385
478
|
});
|
|
386
479
|
}
|
|
387
480
|
|
|
388
|
-
|
|
481
|
+
template<typename T>
|
|
482
|
+
static void neg_sycl(const T *x, T *dst, const int k,
|
|
389
483
|
queue_ptr stream) {
|
|
390
484
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
391
485
|
stream->parallel_for(
|
|
@@ -393,11 +487,12 @@ static void neg_f32_sycl(const float *x, float *dst, const int k,
|
|
|
393
487
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
|
|
394
488
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
|
|
395
489
|
[=](sycl::nd_item<3> item_ct1) {
|
|
396
|
-
|
|
490
|
+
neg(x, dst, k, item_ct1);
|
|
397
491
|
});
|
|
398
492
|
}
|
|
399
493
|
|
|
400
|
-
|
|
494
|
+
template<typename T>
|
|
495
|
+
static void step_sycl(const T *x, T *dst, const int k,
|
|
401
496
|
queue_ptr stream) {
|
|
402
497
|
const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
|
|
403
498
|
stream->parallel_for(
|
|
@@ -405,11 +500,12 @@ static void step_f32_sycl(const float *x, float *dst, const int k,
|
|
|
405
500
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
|
|
406
501
|
sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
|
|
407
502
|
[=](sycl::nd_item<3> item_ct1) {
|
|
408
|
-
|
|
503
|
+
step(x, dst, k, item_ct1);
|
|
409
504
|
});
|
|
410
505
|
}
|
|
411
506
|
|
|
412
|
-
|
|
507
|
+
template<typename T>
|
|
508
|
+
static void sigmoid_sycl(const T *x, T *dst, const int k,
|
|
413
509
|
queue_ptr stream) {
|
|
414
510
|
const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
|
|
415
511
|
stream->parallel_for(
|
|
@@ -417,11 +513,12 @@ static void sigmoid_f32_sycl(const float *x, float *dst, const int k,
|
|
|
417
513
|
sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
|
|
418
514
|
sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
|
|
419
515
|
[=](sycl::nd_item<3> item_ct1) {
|
|
420
|
-
|
|
516
|
+
sigmoid(x, dst, k, item_ct1);
|
|
421
517
|
});
|
|
422
518
|
}
|
|
423
519
|
|
|
424
|
-
|
|
520
|
+
template<typename T>
|
|
521
|
+
static void sqrt_sycl(const T *x, T *dst, const int k,
|
|
425
522
|
queue_ptr stream) {
|
|
426
523
|
const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
|
|
427
524
|
stream->parallel_for(
|
|
@@ -429,11 +526,12 @@ static void sqrt_f32_sycl(const float *x, float *dst, const int k,
|
|
|
429
526
|
sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
|
|
430
527
|
sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
|
|
431
528
|
[=](sycl::nd_item<3> item_ct1) {
|
|
432
|
-
|
|
529
|
+
sqrt(x, dst, k, item_ct1);
|
|
433
530
|
});
|
|
434
531
|
}
|
|
435
532
|
|
|
436
|
-
|
|
533
|
+
template<typename T>
|
|
534
|
+
static void sin_sycl(const T *x, T *dst, const int k,
|
|
437
535
|
queue_ptr stream) {
|
|
438
536
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
439
537
|
stream->parallel_for(
|
|
@@ -441,11 +539,12 @@ static void sin_f32_sycl(const float *x, float *dst, const int k,
|
|
|
441
539
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
|
|
442
540
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
|
|
443
541
|
[=](sycl::nd_item<3> item_ct1) {
|
|
444
|
-
|
|
542
|
+
sin(x, dst, k, item_ct1);
|
|
445
543
|
});
|
|
446
544
|
}
|
|
447
545
|
|
|
448
|
-
|
|
546
|
+
template<typename T>
|
|
547
|
+
static void cos_sycl(const T *x, T *dst, const int k,
|
|
449
548
|
queue_ptr stream) {
|
|
450
549
|
const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
|
|
451
550
|
stream->parallel_for(
|
|
@@ -453,11 +552,12 @@ static void cos_f32_sycl(const float *x, float *dst, const int k,
|
|
|
453
552
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
|
|
454
553
|
sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
|
|
455
554
|
[=](sycl::nd_item<3> item_ct1) {
|
|
456
|
-
|
|
555
|
+
cos(x, dst, k, item_ct1);
|
|
457
556
|
});
|
|
458
557
|
}
|
|
459
558
|
|
|
460
|
-
|
|
559
|
+
template<typename T>
|
|
560
|
+
static void leaky_relu_sycl(const T *x, T *dst, const int k,
|
|
461
561
|
const float negative_slope,
|
|
462
562
|
queue_ptr stream) {
|
|
463
563
|
const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
|
|
@@ -466,11 +566,12 @@ static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
|
|
|
466
566
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
|
|
467
567
|
sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
|
|
468
568
|
[=](sycl::nd_item<3> item_ct1) {
|
|
469
|
-
|
|
569
|
+
leaky_relu(x, dst, k, negative_slope, item_ct1);
|
|
470
570
|
});
|
|
471
571
|
}
|
|
472
572
|
|
|
473
|
-
|
|
573
|
+
template<typename T>
|
|
574
|
+
static void sqr_sycl(const T *x, T *dst, const int k,
|
|
474
575
|
queue_ptr stream) {
|
|
475
576
|
const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
|
|
476
577
|
stream->parallel_for(
|
|
@@ -478,11 +579,12 @@ static void sqr_f32_sycl(const float *x, float *dst, const int k,
|
|
|
478
579
|
sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
|
|
479
580
|
sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
|
|
480
581
|
[=](sycl::nd_item<3> item_ct1) {
|
|
481
|
-
|
|
582
|
+
sqr(x, dst, k, item_ct1);
|
|
482
583
|
});
|
|
483
584
|
}
|
|
484
585
|
|
|
485
|
-
|
|
586
|
+
template<typename T>
|
|
587
|
+
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
|
|
486
588
|
const int nb02, const int nb03, const int ne10, const int ne11,
|
|
487
589
|
const int ne12, const int ne13, const float sf0, const float sf1,
|
|
488
590
|
const float sf2, const float sf3, queue_ptr stream) {
|
|
@@ -492,11 +594,12 @@ static void upscale_f32_sycl(const float *x, float *dst, const int nb00, const i
|
|
|
492
594
|
stream->parallel_for(
|
|
493
595
|
sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
|
|
494
596
|
[=](sycl::nd_item<1> item_ct1) {
|
|
495
|
-
|
|
597
|
+
upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
|
|
496
598
|
});
|
|
497
599
|
}
|
|
498
600
|
|
|
499
|
-
|
|
601
|
+
template<typename T>
|
|
602
|
+
static void pad_sycl(const T *x, T *dst, const int ne00,
|
|
500
603
|
const int ne01, const int ne02, const int ne0,
|
|
501
604
|
const int ne1, const int ne2, queue_ptr stream) {
|
|
502
605
|
int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
|
|
@@ -505,526 +608,929 @@ static void pad_f32_sycl(const float *x, float *dst, const int ne00,
|
|
|
505
608
|
sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
|
|
506
609
|
sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
|
|
507
610
|
[=](sycl::nd_item<3> item_ct1) {
|
|
508
|
-
|
|
611
|
+
pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
|
|
509
612
|
});
|
|
510
613
|
}
|
|
511
614
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
GGML_UNUSED(src1_dd);
|
|
525
|
-
GGML_UNUSED(ctx);
|
|
615
|
+
template<typename T>
|
|
616
|
+
static void clamp_sycl(const T *x, T *dst, const float min,
|
|
617
|
+
const float max, const int k,
|
|
618
|
+
queue_ptr stream) {
|
|
619
|
+
const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
|
|
620
|
+
stream->parallel_for(
|
|
621
|
+
sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
|
|
622
|
+
sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
|
|
623
|
+
sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
|
|
624
|
+
[=](sycl::nd_item<3> item_ct1) {
|
|
625
|
+
clamp(x, dst, min, max, k, item_ct1);
|
|
626
|
+
});
|
|
526
627
|
}
|
|
527
628
|
|
|
528
|
-
inline void
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
534
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
629
|
+
inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
630
|
+
#if defined (GGML_SYCL_F16)
|
|
631
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
632
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
535
633
|
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
634
|
+
#else
|
|
635
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
636
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
637
|
+
#endif
|
|
638
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
639
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
640
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
641
|
+
switch (dst->type) {
|
|
642
|
+
#if defined (GGML_SYCL_F16)
|
|
643
|
+
case GGML_TYPE_F16:
|
|
644
|
+
{
|
|
645
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
646
|
+
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
647
|
+
break;
|
|
648
|
+
}
|
|
649
|
+
#endif
|
|
650
|
+
case GGML_TYPE_F32:
|
|
651
|
+
{
|
|
652
|
+
auto data_pts = cast_data<float>(dst);
|
|
653
|
+
sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
654
|
+
break;
|
|
655
|
+
}
|
|
656
|
+
default:
|
|
657
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
658
|
+
}
|
|
542
659
|
}
|
|
543
|
-
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
|
|
544
|
-
const ggml_tensor *src1, ggml_tensor *dst,
|
|
545
|
-
const float *src0_dd, const float *src1_dd,
|
|
546
|
-
float *dst_dd,
|
|
547
|
-
const queue_ptr &main_stream) {
|
|
548
|
-
|
|
549
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
550
|
-
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
551
660
|
|
|
552
|
-
|
|
661
|
+
inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
662
|
+
#if defined (GGML_SYCL_F16)
|
|
663
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
664
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
553
665
|
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
666
|
+
#else
|
|
667
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
668
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
669
|
+
#endif
|
|
670
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
671
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
672
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
673
|
+
switch (dst->type) {
|
|
674
|
+
#if defined (GGML_SYCL_F16)
|
|
675
|
+
case GGML_TYPE_F16:
|
|
676
|
+
{
|
|
677
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
678
|
+
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
679
|
+
break;
|
|
680
|
+
}
|
|
681
|
+
#endif
|
|
682
|
+
case GGML_TYPE_F32:
|
|
683
|
+
{
|
|
684
|
+
auto data_pts = cast_data<float>(dst);
|
|
685
|
+
abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
686
|
+
break;
|
|
687
|
+
}
|
|
688
|
+
default:
|
|
689
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
690
|
+
}
|
|
558
691
|
}
|
|
559
692
|
|
|
560
|
-
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
561
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
562
|
-
const float *src1_dd, float *dst_dd,
|
|
563
|
-
const queue_ptr &main_stream) {
|
|
564
693
|
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
694
|
+
inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
695
|
+
#if defined (GGML_SYCL_F16)
|
|
696
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
697
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
568
698
|
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
699
|
+
#else
|
|
700
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
701
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
702
|
+
#endif
|
|
703
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
704
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
705
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
706
|
+
switch (dst->type) {
|
|
707
|
+
#if defined (GGML_SYCL_F16)
|
|
708
|
+
case GGML_TYPE_F16:
|
|
709
|
+
{
|
|
710
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
711
|
+
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
712
|
+
break;
|
|
713
|
+
}
|
|
714
|
+
#endif
|
|
715
|
+
case GGML_TYPE_F32:
|
|
716
|
+
{
|
|
717
|
+
auto data_pts = cast_data<float>(dst);
|
|
718
|
+
elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
719
|
+
break;
|
|
720
|
+
}
|
|
721
|
+
default:
|
|
722
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
723
|
+
}
|
|
573
724
|
}
|
|
574
725
|
|
|
575
|
-
inline void
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
GGML_ASSERT(
|
|
581
|
-
GGML_ASSERT(
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
726
|
+
inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
727
|
+
#if defined (GGML_SYCL_F16)
|
|
728
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
729
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
730
|
+
#else
|
|
731
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
732
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
733
|
+
#endif
|
|
734
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
735
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
736
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
737
|
+
switch (dst->type) {
|
|
738
|
+
#if defined (GGML_SYCL_F16)
|
|
739
|
+
case GGML_TYPE_F16:
|
|
740
|
+
{
|
|
741
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
742
|
+
silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
743
|
+
break;
|
|
744
|
+
}
|
|
745
|
+
#endif
|
|
746
|
+
case GGML_TYPE_F32:
|
|
747
|
+
{
|
|
748
|
+
auto data_pts = cast_data<float>(dst);
|
|
749
|
+
silu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
750
|
+
break;
|
|
751
|
+
}
|
|
752
|
+
default:
|
|
753
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
754
|
+
}
|
|
589
755
|
}
|
|
590
756
|
|
|
591
|
-
inline void
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
GGML_ASSERT(
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
757
|
+
inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
758
|
+
#if defined (GGML_SYCL_F16)
|
|
759
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
760
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
761
|
+
#else
|
|
762
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
763
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
764
|
+
#endif
|
|
765
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
766
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
767
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
768
|
+
switch (dst->type) {
|
|
769
|
+
#if defined (GGML_SYCL_F16)
|
|
770
|
+
case GGML_TYPE_F16:
|
|
771
|
+
{
|
|
772
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
773
|
+
gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
774
|
+
break;
|
|
775
|
+
}
|
|
776
|
+
#endif
|
|
777
|
+
case GGML_TYPE_F32:
|
|
778
|
+
{
|
|
779
|
+
auto data_pts = cast_data<float>(dst);
|
|
780
|
+
gelu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
781
|
+
break;
|
|
782
|
+
}
|
|
783
|
+
default:
|
|
784
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
785
|
+
}
|
|
606
786
|
}
|
|
607
787
|
|
|
608
|
-
inline void
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
GGML_ASSERT(
|
|
614
|
-
GGML_ASSERT(
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
788
|
+
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
789
|
+
#if defined (GGML_SYCL_F16)
|
|
790
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
791
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
792
|
+
#else
|
|
793
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
794
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
795
|
+
#endif
|
|
796
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
797
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
798
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
799
|
+
switch (dst->type) {
|
|
800
|
+
#if defined (GGML_SYCL_F16)
|
|
801
|
+
case GGML_TYPE_F16:
|
|
802
|
+
{
|
|
803
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
804
|
+
gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
805
|
+
break;
|
|
806
|
+
}
|
|
807
|
+
#endif
|
|
808
|
+
case GGML_TYPE_F32:
|
|
809
|
+
{
|
|
810
|
+
auto data_pts = cast_data<float>(dst);
|
|
811
|
+
gelu_quick_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
812
|
+
break;
|
|
813
|
+
}
|
|
814
|
+
default:
|
|
815
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
816
|
+
}
|
|
622
817
|
}
|
|
623
818
|
|
|
624
|
-
inline void
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
GGML_ASSERT(
|
|
630
|
-
GGML_ASSERT(
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
819
|
+
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
820
|
+
#if defined (GGML_SYCL_F16)
|
|
821
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
822
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
823
|
+
#else
|
|
824
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
825
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
826
|
+
#endif
|
|
827
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
828
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
829
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
830
|
+
switch (dst->type) {
|
|
831
|
+
#if defined (GGML_SYCL_F16)
|
|
832
|
+
case GGML_TYPE_F16:
|
|
833
|
+
{
|
|
834
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
835
|
+
tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
836
|
+
break;
|
|
837
|
+
}
|
|
838
|
+
#endif
|
|
839
|
+
case GGML_TYPE_F32:
|
|
840
|
+
{
|
|
841
|
+
auto data_pts = cast_data<float>(dst);
|
|
842
|
+
tanh_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
843
|
+
break;
|
|
844
|
+
}
|
|
845
|
+
default:
|
|
846
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
847
|
+
}
|
|
638
848
|
}
|
|
639
849
|
|
|
640
|
-
inline void
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
GGML_ASSERT(
|
|
646
|
-
GGML_ASSERT(
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
850
|
+
inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
851
|
+
#if defined (GGML_SYCL_F16)
|
|
852
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
853
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
854
|
+
#else
|
|
855
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
856
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
857
|
+
#endif
|
|
858
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
859
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
860
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
861
|
+
|
|
862
|
+
switch (dst->type) {
|
|
863
|
+
#if defined (GGML_SYCL_F16)
|
|
864
|
+
case GGML_TYPE_F16:
|
|
865
|
+
{
|
|
866
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
867
|
+
relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
868
|
+
break;
|
|
869
|
+
}
|
|
870
|
+
#endif
|
|
871
|
+
case GGML_TYPE_F32:
|
|
872
|
+
{
|
|
873
|
+
auto data_pts = cast_data<float>(dst);
|
|
874
|
+
relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
875
|
+
break;
|
|
876
|
+
}
|
|
877
|
+
default:
|
|
878
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
879
|
+
}
|
|
654
880
|
}
|
|
655
881
|
|
|
656
|
-
inline void
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
GGML_ASSERT(
|
|
662
|
-
GGML_ASSERT(
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
882
|
+
inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
883
|
+
#if defined (GGML_SYCL_F16)
|
|
884
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
885
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
886
|
+
#else
|
|
887
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
888
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
889
|
+
#endif
|
|
890
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
891
|
+
|
|
892
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
893
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
894
|
+
|
|
895
|
+
switch (dst->type) {
|
|
896
|
+
#if defined (GGML_SYCL_F16)
|
|
897
|
+
case GGML_TYPE_F16:
|
|
898
|
+
{
|
|
899
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
900
|
+
hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
901
|
+
break;
|
|
902
|
+
}
|
|
903
|
+
#endif
|
|
904
|
+
case GGML_TYPE_F32:
|
|
905
|
+
{
|
|
906
|
+
auto data_pts = cast_data<float>(dst);
|
|
907
|
+
hardsigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
908
|
+
break;
|
|
909
|
+
}
|
|
910
|
+
default:
|
|
911
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
912
|
+
}
|
|
670
913
|
}
|
|
671
914
|
|
|
672
|
-
inline void
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
GGML_ASSERT(
|
|
678
|
-
GGML_ASSERT(
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
915
|
+
inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
916
|
+
#if defined (GGML_SYCL_F16)
|
|
917
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
918
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
919
|
+
#else
|
|
920
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
921
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
922
|
+
#endif
|
|
923
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
924
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
925
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
926
|
+
switch (dst->type) {
|
|
927
|
+
#if defined (GGML_SYCL_F16)
|
|
928
|
+
case GGML_TYPE_F16:
|
|
929
|
+
{
|
|
930
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
931
|
+
hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
932
|
+
break;
|
|
933
|
+
}
|
|
934
|
+
#endif
|
|
935
|
+
case GGML_TYPE_F32:
|
|
936
|
+
{
|
|
937
|
+
auto data_pts = cast_data<float>(dst);
|
|
938
|
+
hardswish_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
939
|
+
break;
|
|
940
|
+
}
|
|
941
|
+
default:
|
|
942
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
943
|
+
}
|
|
686
944
|
}
|
|
687
945
|
|
|
688
|
-
inline void
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
GGML_ASSERT(
|
|
694
|
-
GGML_ASSERT(
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
946
|
+
inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
947
|
+
#if defined (GGML_SYCL_F16)
|
|
948
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
949
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
950
|
+
#else
|
|
951
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
952
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
953
|
+
#endif
|
|
954
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
955
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
956
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
957
|
+
switch (dst->type) {
|
|
958
|
+
#if defined (GGML_SYCL_F16)
|
|
959
|
+
case GGML_TYPE_F16:
|
|
960
|
+
{
|
|
961
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
962
|
+
exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
963
|
+
break;
|
|
964
|
+
}
|
|
965
|
+
#endif
|
|
966
|
+
case GGML_TYPE_F32:
|
|
967
|
+
{
|
|
968
|
+
auto data_pts = cast_data<float>(dst);
|
|
969
|
+
exp_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
970
|
+
break;
|
|
971
|
+
}
|
|
972
|
+
default:
|
|
973
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
974
|
+
}
|
|
702
975
|
}
|
|
703
976
|
|
|
704
|
-
inline void
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
GGML_ASSERT(
|
|
710
|
-
GGML_ASSERT(
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
977
|
+
inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
978
|
+
#if defined (GGML_SYCL_F16)
|
|
979
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
980
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
981
|
+
#else
|
|
982
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
983
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
984
|
+
#endif
|
|
985
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
986
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
987
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
988
|
+
switch (dst->type) {
|
|
989
|
+
#if defined (GGML_SYCL_F16)
|
|
990
|
+
case GGML_TYPE_F16:
|
|
991
|
+
{
|
|
992
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
993
|
+
log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
994
|
+
break;
|
|
995
|
+
}
|
|
996
|
+
#endif
|
|
997
|
+
case GGML_TYPE_F32:
|
|
998
|
+
{
|
|
999
|
+
auto data_pts = cast_data<float>(dst);
|
|
1000
|
+
log_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1001
|
+
break;
|
|
1002
|
+
}
|
|
1003
|
+
default:
|
|
1004
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1005
|
+
}
|
|
718
1006
|
}
|
|
719
1007
|
|
|
720
|
-
inline void
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
GGML_ASSERT(
|
|
726
|
-
GGML_ASSERT(
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
1008
|
+
inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1009
|
+
#if defined (GGML_SYCL_F16)
|
|
1010
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1011
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1012
|
+
#else
|
|
1013
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1014
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1015
|
+
#endif
|
|
1016
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1017
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1018
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1019
|
+
switch (dst->type) {
|
|
1020
|
+
#if defined (GGML_SYCL_F16)
|
|
1021
|
+
case GGML_TYPE_F16:
|
|
1022
|
+
{
|
|
1023
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1024
|
+
sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1025
|
+
break;
|
|
1026
|
+
}
|
|
1027
|
+
#endif
|
|
1028
|
+
case GGML_TYPE_F32:
|
|
1029
|
+
{
|
|
1030
|
+
auto data_pts = cast_data<float>(dst);
|
|
1031
|
+
sigmoid_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1032
|
+
break;
|
|
1033
|
+
}
|
|
1034
|
+
default:
|
|
1035
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1036
|
+
}
|
|
734
1037
|
}
|
|
735
1038
|
|
|
736
|
-
inline void
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
1039
|
+
inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1040
|
+
#if defined (GGML_SYCL_F16)
|
|
1041
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1042
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1043
|
+
#else
|
|
1044
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1045
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1046
|
+
#endif
|
|
1047
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1048
|
+
|
|
1049
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1050
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1051
|
+
switch (dst->type) {
|
|
1052
|
+
#if defined (GGML_SYCL_F16)
|
|
1053
|
+
case GGML_TYPE_F16:
|
|
1054
|
+
{
|
|
1055
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1056
|
+
sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1057
|
+
break;
|
|
1058
|
+
}
|
|
1059
|
+
#endif
|
|
1060
|
+
case GGML_TYPE_F32:
|
|
1061
|
+
{
|
|
1062
|
+
auto data_pts = cast_data<float>(dst);
|
|
1063
|
+
sqrt_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1064
|
+
break;
|
|
1065
|
+
}
|
|
1066
|
+
default:
|
|
1067
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
740
1070
|
|
|
741
|
-
|
|
742
|
-
|
|
1071
|
+
inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1072
|
+
#if defined (GGML_SYCL_F16)
|
|
1073
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1074
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1075
|
+
#else
|
|
1076
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1077
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1078
|
+
#endif
|
|
1079
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1080
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1081
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1082
|
+
switch (dst->type) {
|
|
1083
|
+
#if defined (GGML_SYCL_F16)
|
|
1084
|
+
case GGML_TYPE_F16:
|
|
1085
|
+
{
|
|
1086
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1087
|
+
sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1088
|
+
break;
|
|
1089
|
+
}
|
|
1090
|
+
#endif
|
|
1091
|
+
case GGML_TYPE_F32:
|
|
1092
|
+
{
|
|
1093
|
+
auto data_pts = cast_data<float>(dst);
|
|
1094
|
+
sin_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1095
|
+
break;
|
|
1096
|
+
}
|
|
1097
|
+
default:
|
|
1098
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1099
|
+
}
|
|
1100
|
+
}
|
|
743
1101
|
|
|
744
|
-
|
|
1102
|
+
inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1103
|
+
#if defined (GGML_SYCL_F16)
|
|
1104
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1105
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1106
|
+
#else
|
|
1107
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1108
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1109
|
+
#endif
|
|
1110
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1111
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1112
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1113
|
+
switch (dst->type) {
|
|
1114
|
+
#if defined (GGML_SYCL_F16)
|
|
1115
|
+
case GGML_TYPE_F16:
|
|
1116
|
+
{
|
|
1117
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1118
|
+
cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1119
|
+
break;
|
|
1120
|
+
}
|
|
1121
|
+
#endif
|
|
1122
|
+
case GGML_TYPE_F32:
|
|
1123
|
+
{
|
|
1124
|
+
auto data_pts = cast_data<float>(dst);
|
|
1125
|
+
cos_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1126
|
+
break;
|
|
1127
|
+
}
|
|
1128
|
+
default:
|
|
1129
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1130
|
+
}
|
|
1131
|
+
}
|
|
745
1132
|
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
1133
|
+
inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1134
|
+
#if defined (GGML_SYCL_F16)
|
|
1135
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1136
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1137
|
+
#else
|
|
1138
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1139
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1140
|
+
#endif
|
|
1141
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1142
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1143
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1144
|
+
switch (dst->type) {
|
|
1145
|
+
#if defined (GGML_SYCL_F16)
|
|
1146
|
+
case GGML_TYPE_F16:
|
|
1147
|
+
{
|
|
1148
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1149
|
+
step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1150
|
+
break;
|
|
1151
|
+
}
|
|
1152
|
+
#endif
|
|
1153
|
+
case GGML_TYPE_F32:
|
|
1154
|
+
{
|
|
1155
|
+
auto data_pts = cast_data<float>(dst);
|
|
1156
|
+
step_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1157
|
+
break;
|
|
1158
|
+
}
|
|
1159
|
+
default:
|
|
1160
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1161
|
+
}
|
|
750
1162
|
}
|
|
751
1163
|
|
|
752
|
-
inline void
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
1164
|
+
inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1165
|
+
#if defined (GGML_SYCL_F16)
|
|
1166
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1167
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1168
|
+
#else
|
|
1169
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1170
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1171
|
+
#endif
|
|
1172
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1173
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1174
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1175
|
+
switch (dst->type) {
|
|
1176
|
+
#if defined (GGML_SYCL_F16)
|
|
1177
|
+
case GGML_TYPE_F16:
|
|
1178
|
+
{
|
|
1179
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1180
|
+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1181
|
+
break;
|
|
1182
|
+
}
|
|
1183
|
+
#endif
|
|
1184
|
+
case GGML_TYPE_F32:
|
|
1185
|
+
{
|
|
1186
|
+
auto data_pts = cast_data<float>(dst);
|
|
1187
|
+
neg_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1188
|
+
break;
|
|
1189
|
+
}
|
|
1190
|
+
default:
|
|
1191
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
757
1194
|
|
|
758
|
-
|
|
759
|
-
|
|
1195
|
+
inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1196
|
+
#if defined (GGML_SYCL_F16)
|
|
1197
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1198
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1199
|
+
#else
|
|
1200
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1201
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1202
|
+
#endif
|
|
760
1203
|
|
|
1204
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
761
1205
|
float negative_slope;
|
|
762
1206
|
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
1207
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1208
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1209
|
+
switch (dst->type) {
|
|
1210
|
+
#if defined (GGML_SYCL_F16)
|
|
1211
|
+
case GGML_TYPE_F16:
|
|
1212
|
+
{
|
|
1213
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1214
|
+
leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
|
|
1215
|
+
break;
|
|
1216
|
+
}
|
|
1217
|
+
#endif
|
|
1218
|
+
case GGML_TYPE_F32:
|
|
1219
|
+
{
|
|
1220
|
+
auto data_pts = cast_data<float>(dst);
|
|
1221
|
+
leaky_relu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), negative_slope, main_stream);
|
|
1222
|
+
break;
|
|
1223
|
+
}
|
|
1224
|
+
default:
|
|
1225
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1226
|
+
}
|
|
770
1227
|
}
|
|
771
1228
|
|
|
772
|
-
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx,
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
GGML_ASSERT(
|
|
778
|
-
GGML_ASSERT(
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
1229
|
+
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1230
|
+
#if defined (GGML_SYCL_F16)
|
|
1231
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1232
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1233
|
+
#else
|
|
1234
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1235
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1236
|
+
#endif
|
|
1237
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1238
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1239
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1240
|
+
switch (dst->type) {
|
|
1241
|
+
#if defined (GGML_SYCL_F16)
|
|
1242
|
+
case GGML_TYPE_F16:
|
|
1243
|
+
{
|
|
1244
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1245
|
+
sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1246
|
+
break;
|
|
1247
|
+
}
|
|
1248
|
+
#endif
|
|
1249
|
+
case GGML_TYPE_F32:
|
|
1250
|
+
{
|
|
1251
|
+
auto data_pts = cast_data<float>(dst);
|
|
1252
|
+
sqr_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream);
|
|
1253
|
+
break;
|
|
1254
|
+
}
|
|
1255
|
+
default:
|
|
1256
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1257
|
+
}
|
|
786
1258
|
}
|
|
787
1259
|
|
|
788
|
-
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx,
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
1260
|
+
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1261
|
+
#if defined (GGML_SYCL_F16)
|
|
1262
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1263
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1264
|
+
#else
|
|
1265
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
795
1266
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
1267
|
+
#endif
|
|
1268
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1269
|
+
|
|
1270
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1271
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1272
|
+
|
|
1273
|
+
const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0];
|
|
1274
|
+
const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1];
|
|
1275
|
+
const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2];
|
|
1276
|
+
const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3];
|
|
1277
|
+
switch (dst->type) {
|
|
1278
|
+
#if defined (GGML_SYCL_F16)
|
|
1279
|
+
case GGML_TYPE_F16:
|
|
1280
|
+
{
|
|
1281
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1282
|
+
upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
|
|
1283
|
+
dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
1284
|
+
main_stream);
|
|
1285
|
+
break;
|
|
1286
|
+
}
|
|
1287
|
+
#endif
|
|
1288
|
+
case GGML_TYPE_F32:
|
|
1289
|
+
{
|
|
1290
|
+
auto data_pts = cast_data<float>(dst);
|
|
1291
|
+
upscale_sycl(data_pts.src, data_pts.dst, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2],
|
|
1292
|
+
dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3,
|
|
1293
|
+
main_stream);
|
|
1294
|
+
break;
|
|
1295
|
+
}
|
|
1296
|
+
default:
|
|
1297
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1298
|
+
}
|
|
810
1299
|
}
|
|
811
1300
|
|
|
812
|
-
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx,
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
GGML_ASSERT(
|
|
1301
|
+
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1302
|
+
#if defined (GGML_SYCL_F16)
|
|
1303
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1304
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1305
|
+
#else
|
|
1306
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
818
1307
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
819
|
-
|
|
1308
|
+
#endif
|
|
1309
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1310
|
+
GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
|
1311
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1312
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1313
|
+
switch (dst->type) {
|
|
1314
|
+
#if defined (GGML_SYCL_F16)
|
|
1315
|
+
case GGML_TYPE_F16:
|
|
1316
|
+
{
|
|
1317
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1318
|
+
pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
|
|
1319
|
+
dst->ne[1], dst->ne[2], main_stream);
|
|
1320
|
+
break;
|
|
1321
|
+
}
|
|
1322
|
+
#endif
|
|
1323
|
+
case GGML_TYPE_F32:
|
|
1324
|
+
{
|
|
1325
|
+
auto data_pts = cast_data<float>(dst);
|
|
1326
|
+
pad_sycl(data_pts.src, data_pts.dst, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0],
|
|
1327
|
+
dst->ne[1], dst->ne[2], main_stream);
|
|
1328
|
+
break;
|
|
1329
|
+
}
|
|
1330
|
+
default:
|
|
1331
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1332
|
+
}
|
|
1333
|
+
}
|
|
820
1334
|
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
1335
|
+
inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1336
|
+
#if defined(GGML_SYCL_F16)
|
|
1337
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
|
|
1338
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1339
|
+
#else
|
|
824
1340
|
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
1341
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1342
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1343
|
+
#endif
|
|
1344
|
+
GGML_ASSERT(dst->src[0]->type == dst->type);
|
|
1345
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1346
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1347
|
+
float min;
|
|
1348
|
+
float max;
|
|
1349
|
+
memcpy(&min, dst->op_params, sizeof(float));
|
|
1350
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
|
1351
|
+
|
|
1352
|
+
switch (dst->type) {
|
|
1353
|
+
#if defined(GGML_SYCL_F16)
|
|
1354
|
+
case GGML_TYPE_F16:
|
|
1355
|
+
{
|
|
1356
|
+
auto data_pts = cast_data<sycl::half>(dst);
|
|
1357
|
+
clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
|
|
1358
|
+
break;
|
|
1359
|
+
}
|
|
1360
|
+
#endif
|
|
1361
|
+
case GGML_TYPE_F32:
|
|
1362
|
+
{
|
|
1363
|
+
auto data_pts = cast_data<float>(dst);
|
|
1364
|
+
clamp_sycl(data_pts.src, data_pts.dst, min, max, ggml_nelements(dst->src[0]), main_stream);
|
|
1365
|
+
break;
|
|
1366
|
+
}
|
|
1367
|
+
default:
|
|
1368
|
+
GGML_ABORT("GGML tensor type not supported!\n");
|
|
1369
|
+
}
|
|
829
1370
|
}
|
|
830
1371
|
|
|
831
|
-
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx,
|
|
832
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
833
|
-
const float *src1_dd, float *dst_dd,
|
|
834
|
-
const queue_ptr &main_stream) {
|
|
1372
|
+
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
|
|
835
1373
|
|
|
836
|
-
GGML_ASSERT(
|
|
837
|
-
GGML_ASSERT(
|
|
1374
|
+
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
|
1375
|
+
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
|
|
838
1376
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
839
1377
|
GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
|
|
1378
|
+
dpct::queue_ptr main_stream = ctx.stream();
|
|
1379
|
+
SYCL_CHECK(ggml_sycl_set_device(ctx.device));
|
|
1380
|
+
const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
|
|
1381
|
+
const float * src1_dd = static_cast<const float*>(dst->src[1]->data);
|
|
1382
|
+
float * dst_dd = static_cast<float *>(dst->data);
|
|
840
1383
|
|
|
841
1384
|
int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
|
|
842
1385
|
int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
|
|
843
1386
|
// int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
|
|
844
1387
|
int offset = dst->op_params[3] / 4; // offset in bytes
|
|
845
1388
|
|
|
846
|
-
acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst),
|
|
847
|
-
|
|
848
|
-
GGML_UNUSED(dst);
|
|
849
|
-
GGML_UNUSED(ctx);
|
|
850
|
-
}
|
|
851
|
-
|
|
852
|
-
inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
853
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
854
|
-
const float *src1_dd, float *dst_dd,
|
|
855
|
-
const queue_ptr &main_stream) {
|
|
856
|
-
|
|
857
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
861
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
862
|
-
const float *src1_dd, float *dst_dd,
|
|
863
|
-
const queue_ptr &main_stream) {
|
|
864
|
-
|
|
865
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
869
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
870
|
-
const float *src1_dd, float *dst_dd,
|
|
871
|
-
const queue_ptr &main_stream) {
|
|
872
|
-
|
|
873
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
|
|
877
|
-
ggml_tensor *dst, const float *src0_dd,
|
|
878
|
-
const float *src1_dd, float *dst_dd,
|
|
879
|
-
const queue_ptr &main_stream) {
|
|
880
|
-
|
|
881
|
-
ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
|
|
1389
|
+
acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
|
|
882
1390
|
}
|
|
883
1391
|
|
|
884
1392
|
|
|
885
1393
|
void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
886
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
887
|
-
|
|
1394
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1395
|
+
ggml_sycl_op_sqrt(ctx, dst);
|
|
888
1396
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
889
1397
|
}
|
|
890
1398
|
|
|
891
1399
|
void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
892
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
893
|
-
|
|
1400
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1401
|
+
ggml_sycl_op_sin(ctx, dst);
|
|
894
1402
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
895
1403
|
}
|
|
896
1404
|
|
|
897
1405
|
void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
898
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
899
|
-
|
|
1406
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1407
|
+
ggml_sycl_op_cos(ctx, dst);
|
|
900
1408
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
901
1409
|
}
|
|
902
1410
|
|
|
903
1411
|
void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
904
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
905
|
-
|
|
1412
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1413
|
+
ggml_sycl_op_acc(ctx, dst);
|
|
906
1414
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
907
1415
|
}
|
|
908
1416
|
|
|
909
1417
|
void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
910
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
911
|
-
|
|
1418
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1419
|
+
ggml_sycl_op_gelu(ctx, dst);
|
|
912
1420
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
913
1421
|
}
|
|
914
1422
|
|
|
915
1423
|
void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
916
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
917
|
-
|
|
1424
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1425
|
+
ggml_sycl_op_silu(ctx, dst);
|
|
918
1426
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
919
1427
|
}
|
|
920
1428
|
|
|
921
1429
|
void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
922
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
923
|
-
|
|
1430
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1431
|
+
ggml_sycl_op_gelu_quick(ctx, dst);
|
|
924
1432
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
925
1433
|
}
|
|
926
1434
|
|
|
927
1435
|
void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
928
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
929
|
-
|
|
1436
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1437
|
+
ggml_sycl_op_tanh(ctx, dst);
|
|
930
1438
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
931
1439
|
}
|
|
932
1440
|
|
|
933
1441
|
void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
934
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
935
|
-
|
|
1442
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1443
|
+
ggml_sycl_op_relu(ctx, dst);
|
|
936
1444
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
937
1445
|
}
|
|
938
1446
|
|
|
939
1447
|
void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
940
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
941
|
-
|
|
1448
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1449
|
+
ggml_sycl_op_sigmoid(ctx, dst);
|
|
942
1450
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
943
1451
|
}
|
|
944
1452
|
|
|
945
1453
|
void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
946
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
947
|
-
|
|
1454
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1455
|
+
ggml_sycl_op_hardsigmoid(ctx, dst);
|
|
948
1456
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
949
1457
|
}
|
|
950
1458
|
|
|
951
1459
|
void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
952
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
953
|
-
|
|
1460
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1461
|
+
ggml_sycl_op_hardswish(ctx, dst);
|
|
954
1462
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
955
1463
|
}
|
|
956
1464
|
|
|
957
1465
|
|
|
958
1466
|
void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
959
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
960
|
-
|
|
1467
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1468
|
+
ggml_sycl_op_exp(ctx, dst);
|
|
961
1469
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
962
1470
|
}
|
|
963
1471
|
|
|
964
1472
|
void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
965
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
966
|
-
|
|
1473
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1474
|
+
ggml_sycl_op_log(ctx, dst);
|
|
967
1475
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
968
1476
|
}
|
|
969
1477
|
|
|
970
1478
|
void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
971
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
972
|
-
|
|
1479
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1480
|
+
ggml_sycl_op_neg(ctx, dst);
|
|
973
1481
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
974
1482
|
}
|
|
975
1483
|
|
|
976
1484
|
void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
977
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
978
|
-
|
|
1485
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1486
|
+
ggml_sycl_op_step(ctx, dst);
|
|
979
1487
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
980
1488
|
}
|
|
981
1489
|
|
|
982
1490
|
void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
983
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
984
|
-
|
|
1491
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1492
|
+
ggml_sycl_op_leaky_relu(ctx, dst);
|
|
985
1493
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
986
1494
|
}
|
|
987
1495
|
|
|
988
1496
|
void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
989
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
990
|
-
|
|
1497
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1498
|
+
ggml_sycl_op_sqr(ctx, dst);
|
|
991
1499
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
992
1500
|
}
|
|
993
1501
|
|
|
994
1502
|
void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
995
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
996
|
-
|
|
1503
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1504
|
+
ggml_sycl_op_upscale(ctx, dst);
|
|
997
1505
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
998
1506
|
}
|
|
999
1507
|
|
|
1000
1508
|
void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1001
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1002
|
-
|
|
1509
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1510
|
+
ggml_sycl_op_pad(ctx, dst);
|
|
1003
1511
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1004
1512
|
}
|
|
1005
1513
|
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1010
|
-
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add);
|
|
1514
|
+
void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1515
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1516
|
+
ggml_sycl_op_clamp(ctx, dst);
|
|
1011
1517
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1012
1518
|
}
|
|
1013
1519
|
|
|
1014
|
-
void
|
|
1015
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1016
|
-
|
|
1520
|
+
void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1521
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1522
|
+
ggml_sycl_op_sgn(ctx, dst);
|
|
1017
1523
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1018
1524
|
}
|
|
1019
1525
|
|
|
1020
|
-
void
|
|
1021
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1022
|
-
|
|
1526
|
+
void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1527
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1528
|
+
ggml_sycl_op_abs(ctx, dst);
|
|
1023
1529
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1024
1530
|
}
|
|
1025
1531
|
|
|
1026
|
-
void
|
|
1027
|
-
GGML_SYCL_DEBUG("call %s\n", __func__);
|
|
1028
|
-
|
|
1532
|
+
void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
|
|
1533
|
+
GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type));
|
|
1534
|
+
ggml_sycl_op_elu(ctx, dst);
|
|
1029
1535
|
GGML_SYCL_DEBUG("call %s done\n", __func__);
|
|
1030
1536
|
}
|