whispercpp 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
@@ -0,0 +1,283 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
#ifdef cl_intel_subgroups
|
4
|
+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
5
|
+
#else
|
6
|
+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
7
|
+
#endif
|
8
|
+
|
9
|
+
#ifdef cl_intel_required_subgroup_size
|
10
|
+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
|
11
|
+
#define INTEL_GPU 1
|
12
|
+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
|
13
|
+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
|
14
|
+
#elif defined(cl_qcom_reqd_sub_group_size)
|
15
|
+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
16
|
+
#define ADRENO_GPU 1
|
17
|
+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
18
|
+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
|
19
|
+
#endif
|
20
|
+
|
21
|
+
#define QK4_0 32
|
22
|
+
|
23
|
+
typedef char int8_t;
|
24
|
+
typedef uchar uint8_t;
|
25
|
+
typedef short int16_t;
|
26
|
+
typedef ushort uint16_t;
|
27
|
+
typedef int int32_t;
|
28
|
+
typedef uint uint32_t;
|
29
|
+
|
30
|
+
//------------------------------------------------------------------------------
|
31
|
+
// block_q4_0
|
32
|
+
//------------------------------------------------------------------------------
|
33
|
+
// One Q4_0 block: a half-precision multiplier followed by QK4_0 4-bit
// values packed two per byte.
struct block_q4_0
{
    half    d;               // per-block multiplier applied to the dot product
    uint8_t qs[QK4_0 / 2];   // packed 4-bit values, two per byte
};
|
38
|
+
|
39
|
+
// This function requires the original shuffled weights.
|
40
|
+
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
|
41
|
+
// packed together in a byte, so are (q[1], q[17]) and so on.
|
42
|
+
// Dot product of half of one Q4_0 block with sixteen pre-scaled activations.
//
//   x    - packed 4-bit values of the block
//   dh   - the block's half-precision multiplier
//   sumy - plain sum of the sixteen activations covered by this call
//   yl   - the same activations, pre-divided so that multiplying by an
//          unshifted nibble mask gives the correct product
//   il   - element offset into the block; il/2 16-bit words are skipped
//
// Returns d * (acc - 8 * sumy), which equals d * sum(y * (q - 8)).
inline float block_q_4_0_dot_y_flat(
        global uchar * x,
        global half  * dh,
        float          sumy,
        float16        yl,
        int            il
) {
    const float scale = *dh;

    // View the packed nibbles as 16-bit words, starting il/2 words in.
    global ushort * words = (global ushort *)x + il/2;

    const ushort w0 = words[0];
    const ushort w1 = words[1];
    const ushort w2 = words[2];
    const ushort w3 = words[3];

    // The masks keep each nibble at its original bit position (x1, x16,
    // x256, x4096); the matching yl lanes carry the inverse factor.
    float dot = 0.f;

    dot += yl.s0 * (w0 & 0x000F);
    dot += yl.s1 * (w0 & 0x0F00);
    dot += yl.s8 * (w0 & 0x00F0);
    dot += yl.s9 * (w0 & 0xF000);

    dot += yl.s2 * (w1 & 0x000F);
    dot += yl.s3 * (w1 & 0x0F00);
    dot += yl.sa * (w1 & 0x00F0);
    dot += yl.sb * (w1 & 0xF000);

    dot += yl.s4 * (w2 & 0x000F);
    dot += yl.s5 * (w2 & 0x0F00);
    dot += yl.sc * (w2 & 0x00F0);
    dot += yl.sd * (w2 & 0xF000);

    dot += yl.s6 * (w3 & 0x000F);
    dot += yl.s7 * (w3 & 0x0F00);
    dot += yl.se * (w3 & 0x00F0);
    dot += yl.sf * (w3 & 0xF000);

    return scale * (sumy * -8.f + dot);
}
|
75
|
+
|
76
|
+
//
|
77
|
+
// This variant outputs 8 values.
|
78
|
+
//
|
79
|
+
#undef N_DST
|
80
|
+
#undef N_SIMDGROUP
|
81
|
+
#undef N_SIMDWIDTH
|
82
|
+
|
83
|
+
#ifdef INTEL_GPU
|
84
|
+
#define N_DST 8 // each SIMD group works on 8 rows
|
85
|
+
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
|
86
|
+
#define N_SIMDWIDTH 16 // subgroup size
|
87
|
+
#elif defined (ADRENO_GPU)
|
88
|
+
#define N_DST 8
|
89
|
+
#define N_SIMDGROUP 1
|
90
|
+
#define N_SIMDWIDTH 64
|
91
|
+
#endif
|
92
|
+
|
93
|
+
// Matrix-vector product for flattened Q4_0 weights; each subgroup produces
// N_DST (8) consecutive output rows.
//
//   src0_q / src0_d - packed 4-bit values and per-block multipliers of src0,
//                     stored as two separate arrays
//   src1            - float activations
//   dst             - float output
//   ne00, ne01, ... - extents passed through from the calling kernel
//
// Two adjacent subgroup lanes share one block: `ix` picks the block and `il`
// (0 or 8) picks which eight of its first sixteen values, plus the eight
// values 16 elements further on, the lane handles.
inline void mul_vec_q_n_f32_8x_flat(
        global char  * src0_q,
        global half  * src0_d,
        global float * src1,
        global float * dst,
        int ne00,
        int ne01,
        int ne02,
        int ne10,
        int ne12,
        int ne0,
        int ne1,
        int r2,
        int r3
) {
    const ulong nb = ne00/QK4_0;   // blocks per row

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);
    int im = 0;

    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;

    int i12 = im%ne12;
    int i13 = im/ne12;

    // One multiplier per block, so this is also the offset in blocks.
    const ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
    // Each block holds QK4_0/2 bytes of packed values.
    const ulong offset0_q = offset0_d * QK4_0/2;

    global uchar * x = (global uchar *) src0_q + offset0_q;
    global half  * d = (global half  *) src0_d + offset0_d;
    global float * y = (global float *) src1   + r1*ne10 + im*ne00*ne1;

    const uint lane = get_sub_group_local_id();
    int ix = lane/2;
    int il = 8*(lane%2);

    global float * yb = y + ix*QK4_0 + il;

    const ulong row_q = nb*QK4_0/2;   // bytes of packed values per row

    float16 yl;
    float8  sumf = 0.f;

    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
        // yb[0..7] and yb[16..23]
        const float8 lo = vload8(0, yb);
        const float8 hi = vload8(0, yb + 16);

        // Strict left-to-right chain: same summation order as adding the
        // sixteen values one at a time.
        const float sumy = 0.f
            + lo.s0 + lo.s1 + lo.s2 + lo.s3 + lo.s4 + lo.s5 + lo.s6 + lo.s7
            + hi.s0 + hi.s1 + hi.s2 + hi.s3 + hi.s4 + hi.s5 + hi.s6 + hi.s7;

        // Pre-divide so the helper can multiply by unshifted nibble masks.
        yl = (float16)(
            lo.s0,       lo.s1/256.f,
            lo.s2,       lo.s3/256.f,
            lo.s4,       lo.s5/256.f,
            lo.s6,       lo.s7/256.f,
            hi.s0/16.f,  hi.s1/4096.f,
            hi.s2/16.f,  hi.s3/4096.f,
            hi.s4/16.f,  hi.s5/4096.f,
            hi.s6/16.f,  hi.s7/4096.f);

        global uchar * xq = x + ib*QK4_0/2;
        global half  * xd = d + ib;

        sumf.s0 += block_q_4_0_dot_y_flat(xq + 0*row_q, xd + 0*nb, sumy, yl, il);
        sumf.s1 += block_q_4_0_dot_y_flat(xq + 1*row_q, xd + 1*nb, sumy, yl, il);
        sumf.s2 += block_q_4_0_dot_y_flat(xq + 2*row_q, xd + 2*nb, sumy, yl, il);
        sumf.s3 += block_q_4_0_dot_y_flat(xq + 3*row_q, xd + 3*nb, sumy, yl, il);

        sumf.s4 += block_q_4_0_dot_y_flat(xq + 4*row_q, xd + 4*nb, sumy, yl, il);
        sumf.s5 += block_q_4_0_dot_y_flat(xq + 5*row_q, xd + 5*nb, sumy, yl, il);
        sumf.s6 += block_q_4_0_dot_y_flat(xq + 6*row_q, xd + 6*nb, sumy, yl, il);
        sumf.s7 += block_q_4_0_dot_y_flat(xq + 7*row_q, xd + 7*nb, sumy, yl, il);

        yb += QK4_0 * (N_SIMDWIDTH/2);
    }

    float8 tot = (float8)(
        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
    );

    // Only lane 0 of the subgroup writes; rows at or beyond ne01 are skipped.
    if (lane != 0) {
        return;
    }

    const int out0 = r1*ne0 + im*ne0*ne1 + first_row;

    if (first_row + 0 < ne01) { dst[out0 + 0] = tot.s0; }
    if (first_row + 1 < ne01) { dst[out0 + 1] = tot.s1; }
    if (first_row + 2 < ne01) { dst[out0 + 2] = tot.s2; }
    if (first_row + 3 < ne01) { dst[out0 + 3] = tot.s3; }

    if (first_row + 4 < ne01) { dst[out0 + 4] = tot.s4; }
    if (first_row + 5 < ne01) { dst[out0 + 5] = tot.s5; }
    if (first_row + 6 < ne01) { dst[out0 + 6] = tot.s6; }
    if (first_row + 7 < ne01) { dst[out0 + 7] = tot.s7; }
}
|
229
|
+
|
230
|
+
#ifdef INTEL_GPU
|
231
|
+
REQD_SUBGROUP_SIZE_16
|
232
|
+
#elif defined (ADRENO_GPU)
|
233
|
+
REQD_SUBGROUP_SIZE_64
|
234
|
+
#endif
|
235
|
+
// Indexed matrix-vector product over flattened Q4_0 weights.
//
// Work-group id 2 encodes a pair (iid1, idx) = (id / ne20, id % ne20).
// The int stored at row iid1 (byte stride nb21), column idx of src2 selects
// which src0 slice (i02) is multiplied; the product itself is delegated to
// mul_vec_q_n_f32_8x_flat.
//
// offset1 / offset2 / offsetd are byte offsets applied to src1 / src2 / dst.
kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
        global char  * src0_q,
        global half  * src0_d,
        global float * src1,
        ulong          offset1,
        global char  * src2,
        ulong          offset2,
        global float * dst,
        ulong          offsetd,
        int            ne00,
        int            ne01,
        int            ne02,
        ulong          nb00,
        ulong          nb02,
        int            ne10,
        int            ne11,
        int            ne12,
        ulong          nb11,
        ulong          nb12,
        int            ne20,
        int            ne21,
        ulong          nb21,
        int            ne0,
        int            ne1,
        int            r2,
        int            r3
) {
    global char  * src1_bytes = (global char *)src1 + offset1;
    global char  * ids_bytes  = (global char *)src2 + offset2;
    global float * dst_base   = (global float *)((global char *)dst + offsetd);

    const size_t group_z = get_group_id(2);
    const int iid1 = group_z/ne20;
    const int idx  = group_z%ne20;

    // Selected src0 slice for this (iid1, idx) pair.
    const int i02 = ((global int *)(ids_bytes + iid1*nb21))[idx];

    const int i11 = idx%ne11;

    // Slice offset in blocks; the packed values take QK4_0/2 bytes per block.
    const ulong slice_blocks = i02*nb02/nb00;

    global char  * src0_q_cur = src0_q + slice_blocks*(QK4_0/2);
    global half  * src0_d_cur = src0_d + slice_blocks;
    global float * src1_cur   = (global float *)(src1_bytes + i11*nb11 + iid1*nb12);
    global float * dst_cur    = dst_base + idx*ne0 + iid1*ne1*ne0;

    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur,
                            ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
// Copies a float tensor of extent (s_ne0, s_ne1, s_ne2) into a float tensor
// whose rows are d_ne0 wide and whose planes are d_ne0 * d_ne1 elements,
// writing 0.0f to every destination element that lies outside the source
// extent.
//
// Dimension 0 is indexed by the global id; dimensions 1 and 2 are indexed by
// the work-group id. src0_offset / dst_offset are byte offsets.
kernel void kernel_pad(
        global const void * src0_ptr,
        ulong src0_offset,
        global void * dst_ptr,
        ulong dst_offset,
        int s_ne0, int s_ne1, int s_ne2,
        int d_ne0, int d_ne1, int d_ne2
) {
    const int col = get_global_id(0);
    if (col >= d_ne0) {
        return;
    }

    const int row   = get_group_id(1);
    const int plane = get_group_id(2);

    global const float * in  = (global const float *)((global const char *)src0_ptr + src0_offset);
    global float       * out = (global float *)((global char *)dst_ptr + dst_offset);

    // Zero unless the destination coordinate also exists in the source.
    float value = 0.0f;
    if ((col < s_ne0) && (row < s_ne1) && (plane < s_ne2)) {
        value = in[col + row * s_ne0 + plane * s_ne0 * s_ne1];
    }

    out[col + row * d_ne0 + plane * d_ne0 * d_ne1] = value;
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
// Tiles src0 into dst: every destination coordinate reads the source
// coordinate taken modulo the source extent in each dimension.
//
// One work-item handles one destination row (d1, d2, d3) and walks dimension
// 0 itself. src0_offset / dst_offset and all nb* values are byte quantities.
// Each element is copied as src0_nb0 raw bytes.
// NOTE(review): this treats src0_nb0 as the element size, i.e. it assumes
// dimension 0 of src0 is densely packed - confirm against the host code.
kernel void kernel_repeat(
        global const char * src0_data_in,
        global       char * dst_data_in,
        ulong src0_offset,
        ulong dst_offset,
        int src0_ne0, int src0_ne1, int src0_ne2, int src0_ne3,
        ulong src0_nb0, ulong src0_nb1, ulong src0_nb2, ulong src0_nb3,
        int dst_ne0, int dst_ne1, int dst_ne2, int dst_ne3,
        ulong dst_nb0, ulong dst_nb1, ulong dst_nb2, ulong dst_nb3
) {
    global const char * src0_data = src0_data_in + src0_offset;
    global       char * dst_data  = dst_data_in  + dst_offset;

    const int d3 = get_global_id(2);
    const int d2 = get_global_id(1);
    const int d1 = get_global_id(0);

    if (d3 >= dst_ne3 || d2 >= dst_ne2 || d1 >= dst_ne1) {
        return;
    }

    // Source coordinates wrap around the source extent.
    const int s3 = d3 % src0_ne3;
    const int s2 = d2 % src0_ne2;
    const int s1 = d1 % src0_ne1;

    const global char * p_src0_slice = src0_data + (ulong)s3*src0_nb3 + (ulong)s2*src0_nb2 + (ulong)s1*src0_nb1;
    global       char * p_dst_slice  = dst_data  + (ulong)d3*dst_nb3  + (ulong)d2*dst_nb2  + (ulong)d1*dst_nb1;

    for (int d0 = 0; d0 < dst_ne0; ++d0) {
        const int s0 = d0 % src0_ne0;

        const global char * restrict src_el = p_src0_slice + (ulong)s0*src0_nb0;
        global       char * restrict dst_el = p_dst_slice  + (ulong)d0*dst_nb0;

        // The counter has the same type as the bound (ulong), which avoids a
        // signed/unsigned comparison and any int overflow.
        for (ulong k = 0; k < src0_nb0; ++k) {
            dst_el[k] = src_el[k];
        }
    }
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
//------------------------------------------------------------------------------
|
4
|
+
// sigmoid
|
5
|
+
//------------------------------------------------------------------------------
|
6
|
+
|
7
|
+
// Element-wise logistic function on floats: dst[i] = 1 / (1 + exp(-src0[i])).
// One work-item per element, indexed by global id 0; no bounds check is
// performed here. offset0 / offsetd are byte offsets.
kernel void kernel_sigmoid_f32(
        global float * src0,
        ulong offset0,
        global float * dst,
        ulong offsetd
) {
    global const float * in  = (global const float *)((global const char *)src0 + offset0);
    global float       * out = (global float *)((global char *)dst + offsetd);

    const size_t i = get_global_id(0);
    const float  x = in[i];

    out[i] = 1.0f / (1.0f + exp(-x));
}
|
18
|
+
|
19
|
+
// Element-wise logistic function on halves: dst[i] = 1 / (1 + exp(-src0[i])).
// One work-item per element, indexed by global id 0; no bounds check is
// performed here. offset0 / offsetd are byte offsets.
kernel void kernel_sigmoid_f16(
        global half * src0,
        ulong offset0,
        global half * dst,
        ulong offsetd
) {
    global const half * in  = (global const half *)((global const char *)src0 + offset0);
    global half       * out = (global half *)((global char *)dst + offsetd);

    const size_t i = get_global_id(0);
    const half   x = in[i];

    // exp() is evaluated on the half value; the float result is narrowed
    // back to half by the store.
    out[i] = 1.0f / (1.0f + exp(-x));
}
|
@@ -0,0 +1,72 @@
|
|
1
|
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
2
|
+
|
3
|
+
//------------------------------------------------------------------------------
|
4
|
+
// sub
|
5
|
+
//------------------------------------------------------------------------------
|
6
|
+
// Element-wise float subtraction dst = src0 - src1 with src1 repeated along
// every dimension (its index is taken modulo ne10..ne13).
//
// One work-group per row: group ids 0/1/2 select dimensions 1/2/3, and the
// work-items of the group stride across dimension 0. All offset* and nb*
// arguments are byte quantities.
kernel void kernel_sub(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * dst,
        ulong         offsetd,
        ulong         nb00,
        ulong         nb01,
        ulong         nb02,
        ulong         nb03,
        int           ne10,
        int           ne11,
        int           ne12,
        int           ne13,
        ulong         nb10,
        ulong         nb11,
        ulong         nb12,
        ulong         nb13,
        int           ne0,
        ulong         nb0,
        ulong         nb1,
        ulong         nb2,
        ulong         nb3
) {
    const int i03 = get_group_id(2);
    const int i02 = get_group_id(1);
    const int i01 = get_group_id(0);

    // src1 coordinates wrap around its own extent.
    const int i13 = i03 % ne13;
    const int i12 = i02 % ne12;
    const int i11 = i01 % ne11;

    global char * lhs_row = src0 + offset0 + i03*nb03 + i02*nb02 + i01*nb01;
    global char * rhs_row = src1 + offset1 + i13*nb13 + i12*nb12 + i11*nb11;
    global char * out_row = dst  + offsetd + i03*nb3  + i02*nb2  + i01*nb1;

    const size_t stride = get_local_size(0);

    for (int i0 = get_local_id(0); i0 < ne0; i0 += stride) {
        const int i10 = i0 % ne10;

        const float lhs = *((global float *)(lhs_row + i0*nb00));
        const float rhs = *((global float *)(rhs_row + i10*nb10));

        *((global float *)(out_row + i0*nb0)) = lhs - rhs;
    }
}
|
52
|
+
|
53
|
+
// assumption: src1 is a row
|
54
|
+
// broadcast src1 into src0
|
55
|
+
// Subtracts a single row from src0, four floats at a time:
// dst[gid] = src0[gid] - src1[gid % ne], with all indices in float4 units.
// offset0 / offset1 / offsetd are byte offsets.
kernel void kernel_sub_row(
        global float4 * src0,
        ulong           offset0,
        global float4 * src1,
        ulong           offset1,
        global float4 * dst,
        ulong           offsetd,
        int             ne
) {
    global float4 * lhs = (global float4 *)((global char *)src0 + offset0);
    global float4 * row = (global float4 *)((global char *)src1 + offset1);
    global float4 * out = (global float4 *)((global char *)dst  + offsetd);

    const uint gid = get_global_id(0);

    // gid % ne written as a divide and multiply-subtract; the original
    // author notes this performs better than using %.
    const uint row_idx = gid - (gid/ne)*ne;

    out[gid] = lhs[gid] - row[row_idx];
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
|
2
|
+
// Sums each row of a float tensor: one work-item per row (i1, i2, i3) adds
// the ne00 elements of that row sequentially and stores the total in the
// first element of the matching dst row.
// offset0 / offsetd and all nb* arguments are byte quantities.
kernel void kernel_sum_rows_f32(
        global float * src0,
        ulong          offset0,
        global float * dst,
        ulong          offsetd,
        int            ne00,
        int            ne01,
        int            ne02,
        int            ne03,
        ulong          nb01,
        ulong          nb02,
        ulong          nb03,
        ulong          nb1,
        ulong          nb2,
        ulong          nb3
) {
    const int i3 = get_global_id(2);
    const int i2 = get_global_id(1);
    const int i1 = get_global_id(0);

    if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
        return;
    }

    global float * in_row  = (global float *)((global char *)src0 + offset0 + i1*nb01 + i2*nb02 + i3*nb03);
    global float * out_row = (global float *)((global char *)dst  + offsetd + i1*nb1  + i2*nb2  + i3*nb3);

    float total = 0;
    for (int col = 0; col < ne00; col++) {
        total += in_row[col];
    }

    *out_row = total;
}
|
@@ -0,0 +1,63 @@
|
|
1
|
+
// Half-precision support; the f16 tanh kernel in this file operates on `half`.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Vendor-specific required-subgroup-size attributes.
// Intel exposes sizes 16/32, Qualcomm (Adreno) exposes "half"/"full" wave
// modes, mapped here to the 64/128 macro names.
#if defined(cl_intel_required_subgroup_size)
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
|
14
|
+
|
15
|
+
kernel void kernel_tanh_f32_nd(
|
16
|
+
global void * p_src0_base, ulong off_src0_abs,
|
17
|
+
global void * p_dst_base, ulong off_dst_abs,
|
18
|
+
int ne00, int ne01, int ne02, int ne03,
|
19
|
+
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
20
|
+
int ne10, int ne11, int ne12, int ne13,
|
21
|
+
ulong nb10, ulong nb11, ulong nb12, ulong nb13
|
22
|
+
) {
|
23
|
+
int i0 = get_global_id(0);
|
24
|
+
int i1 = get_global_id(1);
|
25
|
+
int i2 = get_global_id(2);
|
26
|
+
|
27
|
+
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
28
|
+
for (int i3 = 0; i3 < ne13; ++i3) {
|
29
|
+
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
30
|
+
global const float *src_val_ptr = (global const float *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
31
|
+
|
32
|
+
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
33
|
+
global float *dst_val_ptr = (global float *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
34
|
+
|
35
|
+
*dst_val_ptr = tanh(*src_val_ptr);
|
36
|
+
}
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
kernel void kernel_tanh_f16_nd(
|
41
|
+
global void * p_src0_base, ulong off_src0_abs,
|
42
|
+
global void * p_dst_base, ulong off_dst_abs,
|
43
|
+
int ne00, int ne01, int ne02, int ne03,
|
44
|
+
ulong nb00, ulong nb01, ulong nb02, ulong nb03,
|
45
|
+
int ne10, int ne11, int ne12, int ne13,
|
46
|
+
ulong nb10, ulong nb11, ulong nb12, ulong nb13
|
47
|
+
) {
|
48
|
+
int i0 = get_global_id(0);
|
49
|
+
int i1 = get_global_id(1);
|
50
|
+
int i2 = get_global_id(2);
|
51
|
+
|
52
|
+
if (i0 < ne10 && i1 < ne11 && i2 < ne12) {
|
53
|
+
for (int i3 = 0; i3 < ne13; ++i3) {
|
54
|
+
ulong src_offset_in_tensor = (ulong)i0*nb00 + (ulong)i1*nb01 + (ulong)i2*nb02 + (ulong)i3*nb03;
|
55
|
+
global const half *src_val_ptr = (global const half *)((global char *)p_src0_base + off_src0_abs + src_offset_in_tensor);
|
56
|
+
|
57
|
+
ulong dst_offset_in_tensor = (ulong)i0*nb10 + (ulong)i1*nb11 + (ulong)i2*nb12 + (ulong)i3*nb13;
|
58
|
+
global half *dst_val_ptr = (global half *)((global char *)p_dst_base + off_dst_abs + dst_offset_in_tensor);
|
59
|
+
|
60
|
+
*dst_val_ptr = tanh(*src_val_ptr);
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
kernel void kernel_timestep_embedding(
|
2
|
+
global const void * p_timesteps,
|
3
|
+
ulong off_timesteps,
|
4
|
+
global void * p_dst,
|
5
|
+
ulong off_dst,
|
6
|
+
int dst_nb1_bytes,
|
7
|
+
int logical_dim,
|
8
|
+
int max_period
|
9
|
+
) {
|
10
|
+
int local_i;
|
11
|
+
int local_j;
|
12
|
+
int local_half_dim;
|
13
|
+
float local_timestep_val;
|
14
|
+
float local_freq;
|
15
|
+
float local_arg;
|
16
|
+
global float * local_embed_data_ptr;
|
17
|
+
global const float * local_timesteps_input_ptr;
|
18
|
+
global float * local_dst_output_base_ptr;
|
19
|
+
|
20
|
+
local_timesteps_input_ptr = (global const float *)((global char *)p_timesteps + off_timesteps);
|
21
|
+
local_dst_output_base_ptr = (global float *)((global char *)p_dst + off_dst);
|
22
|
+
|
23
|
+
local_i = get_global_id(1);
|
24
|
+
local_j = get_global_id(0);
|
25
|
+
|
26
|
+
local_half_dim = logical_dim / 2;
|
27
|
+
local_embed_data_ptr = (global float *)((global char *)local_dst_output_base_ptr + local_i * dst_nb1_bytes);
|
28
|
+
|
29
|
+
if (logical_dim % 2 != 0 && local_j == ((logical_dim + 1) / 2)) {
|
30
|
+
local_embed_data_ptr[logical_dim] = 0.0f;
|
31
|
+
}
|
32
|
+
|
33
|
+
if (local_j >= local_half_dim) {
|
34
|
+
return;
|
35
|
+
}
|
36
|
+
|
37
|
+
local_timestep_val = local_timesteps_input_ptr[local_i];
|
38
|
+
|
39
|
+
if (local_half_dim == 0) {
|
40
|
+
local_freq = 1.0f;
|
41
|
+
} else {
|
42
|
+
local_freq = exp(-log((float)max_period) * (float)local_j / (float)local_half_dim);
|
43
|
+
}
|
44
|
+
|
45
|
+
local_arg = local_timestep_val * local_freq;
|
46
|
+
local_embed_data_ptr[local_j] = cos(local_arg);
|
47
|
+
local_embed_data_ptr[local_j + local_half_dim] = sin(local_arg);
|
48
|
+
}
|