whispercpp 1.3.2 → 1.3.3
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -3
- data/README.md +71 -14
- data/Rakefile +20 -7
- data/ext/.gitignore +4 -6
- data/ext/dependencies.rb +36 -24
- data/ext/extconf.rb +1 -1
- data/ext/options.rb +48 -184
- data/ext/ruby_whisper.c +18 -0
- data/ext/ruby_whisper_context.c +43 -12
- data/ext/ruby_whisper_model.c +1 -1
- data/ext/ruby_whisper_params.c +4 -2
- data/ext/ruby_whisper_segment.c +81 -4
- data/ext/ruby_whisper_transcribe.cpp +13 -7
- data/ext/ruby_whisper_vad_params.c +1 -1
- data/ext/sources/CMakeLists.txt +5 -1
- data/ext/sources/bindings/javascript/package.json +1 -1
- data/ext/sources/examples/addon.node/__test__/whisper.spec.js +120 -24
- data/ext/sources/examples/addon.node/addon.cpp +150 -31
- data/ext/sources/examples/addon.node/index.js +3 -0
- data/ext/sources/examples/addon.node/vad-example.js +132 -0
- data/ext/sources/examples/bench/bench.cpp +3 -2
- data/ext/sources/examples/cli/cli.cpp +3 -2
- data/ext/sources/examples/command/command.cpp +32 -8
- data/ext/sources/examples/common-whisper.cpp +14 -7
- data/ext/sources/examples/lsp/lsp.cpp +2 -0
- data/ext/sources/examples/quantize/quantize.cpp +3 -0
- data/ext/sources/examples/server/CMakeLists.txt +3 -0
- data/ext/sources/examples/server/server.cpp +169 -22
- data/ext/sources/examples/stream/stream.cpp +6 -0
- data/ext/sources/examples/talk-llama/CMakeLists.txt +4 -1
- data/ext/sources/examples/talk-llama/llama-arch.cpp +171 -3
- data/ext/sources/examples/talk-llama/llama-arch.h +28 -1
- data/ext/sources/examples/talk-llama/llama-batch.cpp +741 -272
- data/ext/sources/examples/talk-llama/llama-batch.h +112 -54
- data/ext/sources/examples/talk-llama/llama-chat.cpp +30 -8
- data/ext/sources/examples/talk-llama/llama-chat.h +1 -0
- data/ext/sources/examples/talk-llama/llama-context.cpp +520 -351
- data/ext/sources/examples/talk-llama/llama-context.h +38 -17
- data/ext/sources/examples/talk-llama/llama-cparams.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-cparams.h +1 -1
- data/ext/sources/examples/talk-llama/llama-graph.cpp +447 -372
- data/ext/sources/examples/talk-llama/llama-graph.h +128 -58
- data/ext/sources/examples/talk-llama/llama-hparams.cpp +10 -2
- data/ext/sources/examples/talk-llama/llama-hparams.h +19 -2
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.cpp +279 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified-iswa.h +128 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.cpp +1841 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache-unified.h +303 -0
- data/ext/sources/examples/talk-llama/llama-kv-cache.h +14 -472
- data/ext/sources/examples/talk-llama/llama-kv-cells.h +86 -26
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.cpp +246 -0
- data/ext/sources/examples/talk-llama/llama-memory-hybrid.h +138 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.cpp +1125 -0
- data/ext/sources/examples/talk-llama/llama-memory-recurrent.h +183 -0
- data/ext/sources/examples/talk-llama/llama-memory.cpp +58 -0
- data/ext/sources/examples/talk-llama/llama-memory.h +88 -4
- data/ext/sources/examples/talk-llama/llama-mmap.cpp +1 -1
- data/ext/sources/examples/talk-llama/llama-model-loader.cpp +42 -17
- data/ext/sources/examples/talk-llama/llama-model-saver.cpp +1 -0
- data/ext/sources/examples/talk-llama/llama-model.cpp +1863 -563
- data/ext/sources/examples/talk-llama/llama-model.h +27 -0
- data/ext/sources/examples/talk-llama/llama-quant.cpp +89 -6
- data/ext/sources/examples/talk-llama/llama-vocab.cpp +65 -28
- data/ext/sources/examples/talk-llama/llama-vocab.h +1 -0
- data/ext/sources/examples/talk-llama/llama.cpp +11 -7
- data/ext/sources/examples/talk-llama/llama.h +147 -40
- data/ext/sources/examples/talk-llama/talk-llama.cpp +2 -0
- data/ext/sources/examples/talk-llama/unicode.cpp +5 -0
- data/ext/sources/examples/vad-speech-segments/speech.cpp +6 -0
- data/ext/sources/examples/wchess/wchess.cmd/wchess.cmd.cpp +2 -0
- data/ext/sources/ggml/CMakeLists.txt +48 -3
- data/ext/sources/ggml/cmake/common.cmake +24 -0
- data/ext/sources/ggml/include/ggml-backend.h +1 -1
- data/ext/sources/ggml/include/ggml-cpu.h +2 -0
- data/ext/sources/ggml/include/ggml.h +144 -5
- data/ext/sources/ggml/src/CMakeLists.txt +82 -24
- data/ext/sources/ggml/src/ggml-backend-reg.cpp +5 -0
- data/ext/sources/ggml/src/ggml-backend.cpp +46 -23
- data/ext/sources/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-cann/CMakeLists.txt +1 -0
- data/ext/sources/ggml/src/ggml-cann/common.h +6 -1
- data/ext/sources/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- data/ext/sources/ggml/src/ggml-common.h +4 -0
- data/ext/sources/ggml/src/ggml-cpu/CMakeLists.txt +133 -40
- data/ext/sources/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- data/ext/sources/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- data/ext/sources/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- data/ext/sources/ggml/src/ggml-cpu/common.h +4 -3
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.c +146 -105
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- data/ext/sources/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- data/ext/sources/ggml/src/ggml-cpu/ops.cpp +1057 -174
- data/ext/sources/ggml/src/ggml-cpu/ops.h +8 -0
- data/ext/sources/ggml/src/ggml-cpu/quants.c +1158 -0
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.cpp +1571 -0
- data/ext/sources/ggml/src/ggml-cpu/repack.h +98 -0
- data/ext/sources/ggml/src/ggml-cpu/simd-mappings.h +330 -38
- data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- data/ext/sources/ggml/src/ggml-cpu/vec.cpp +111 -18
- data/ext/sources/ggml/src/ggml-cpu/vec.h +303 -94
- data/ext/sources/ggml/src/ggml-cuda/common.cuh +60 -37
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- data/ext/sources/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cu +22 -0
- data/ext/sources/ggml/src/ggml-cuda/convert.cuh +5 -0
- data/ext/sources/ggml/src/ggml-cuda/fattn-common.cuh +2 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- data/ext/sources/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- data/ext/sources/ggml/src/ggml-cuda/ggml-cuda.cu +265 -123
- data/ext/sources/ggml/src/ggml-cuda/mean.cu +19 -0
- data/ext/sources/ggml/src/ggml-cuda/mean.cuh +3 -0
- data/ext/sources/ggml/src/ggml-cuda/mmv.cu +257 -87
- data/ext/sources/ggml/src/ggml-cuda/mmv.cuh +2 -3
- data/ext/sources/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cu +5 -18
- data/ext/sources/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- data/ext/sources/ggml/src/ggml-cuda/unary.cu +89 -0
- data/ext/sources/ggml/src/ggml-cuda/unary.cuh +7 -0
- data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- data/ext/sources/ggml/src/ggml-impl.h +127 -183
- data/ext/sources/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- data/ext/sources/ggml/src/ggml-metal/ggml-metal-impl.h +27 -0
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.m +331 -49
- data/ext/sources/ggml/src/ggml-metal/ggml-metal.metal +564 -282
- data/ext/sources/ggml/src/ggml-musa/mudnn.cuh +2 -2
- data/ext/sources/ggml/src/ggml-opencl/CMakeLists.txt +14 -0
- data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp +1859 -489
- data/ext/sources/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/div.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/glu.cl +201 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- data/ext/sources/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- data/ext/sources/ggml/src/ggml-quants.c +6 -8
- data/ext/sources/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- data/ext/sources/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- data/ext/sources/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- data/ext/sources/ggml/src/ggml-sycl/common.hpp +20 -48
- data/ext/sources/ggml/src/ggml-sycl/concat.cpp +28 -41
- data/ext/sources/ggml/src/ggml-sycl/conv.cpp +4 -10
- data/ext/sources/ggml/src/ggml-sycl/convert.cpp +117 -165
- data/ext/sources/ggml/src/ggml-sycl/cpy.cpp +192 -53
- data/ext/sources/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- data/ext/sources/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- data/ext/sources/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- data/ext/sources/ggml/src/ggml-sycl/element_wise.cpp +648 -1039
- data/ext/sources/ggml/src/ggml-sycl/element_wise.hpp +18 -9
- data/ext/sources/ggml/src/ggml-sycl/gemm.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/getrows.cpp +8 -105
- data/ext/sources/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -100
- data/ext/sources/ggml/src/ggml-sycl/gla.cpp +2 -2
- data/ext/sources/ggml/src/ggml-sycl/im2col.cpp +1 -1
- data/ext/sources/ggml/src/ggml-sycl/mmq.cpp +60 -80
- data/ext/sources/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- data/ext/sources/ggml/src/ggml-sycl/norm.cpp +55 -74
- data/ext/sources/ggml/src/ggml-sycl/quants.hpp +38 -10
- data/ext/sources/ggml/src/ggml-sycl/rope.cpp +138 -27
- data/ext/sources/ggml/src/ggml-sycl/softmax.cpp +3 -3
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- data/ext/sources/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- data/ext/sources/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- data/ext/sources/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- data/ext/sources/ggml/src/ggml-sycl/wkv.cpp +12 -16
- data/ext/sources/ggml/src/ggml-vulkan/CMakeLists.txt +36 -32
- data/ext/sources/ggml/src/ggml-vulkan/ggml-vulkan.cpp +726 -282
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +13 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.comp +15 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.comp +29 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +12 -3
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +9 -0
- data/ext/sources/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +10 -1
- data/ext/sources/ggml/src/ggml.c +328 -48
- data/ext/sources/ggml/src/ggml.cpp +26 -0
- data/ext/sources/ggml/src/gguf.cpp +24 -3
- data/ext/sources/include/whisper.h +2 -0
- data/ext/sources/src/CMakeLists.txt +2 -0
- data/ext/sources/src/coreml/whisper-compat.h +10 -0
- data/ext/sources/src/coreml/whisper-compat.m +35 -0
- data/ext/sources/src/coreml/whisper-decoder-impl.m +1 -0
- data/ext/sources/src/coreml/whisper-encoder-impl.m +1 -0
- data/ext/sources/src/whisper.cpp +218 -169
- data/extsources.rb +15 -9
- data/lib/whisper/context.rb +15 -0
- data/lib/whisper/model/uri.rb +56 -1
- data/lib/whisper/segment.rb +58 -0
- data/sig/whisper.rbs +68 -38
- data/{tests → test}/helper.rb +1 -12
- data/{tests → test}/test_model.rb +9 -0
- data/test/test_package.rb +51 -0
- data/test/test_segment.rb +146 -0
- data/{tests → test}/test_whisper.rb +70 -0
- data/whispercpp.gemspec +2 -3
- metadata +91 -43
- data/ext/sources/.dockerignore +0 -3
- data/ext/sources/.github/workflows/bindings-ruby.yml +0 -21
- data/ext/sources/ci/run.sh +0 -336
- data/ext/sources/close-issue.yml +0 -28
- data/ext/sources/examples/talk-llama/llama-kv-cache.cpp +0 -2739
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- data/ext/sources/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13747
- data/tests/test_package.rb +0 -46
- data/tests/test_segment.rb +0 -74
- /data/ext/sources/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /data/ext/sources/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
- /data/{tests → test}/jfk_reader/.gitignore +0 -0
- /data/{tests → test}/jfk_reader/extconf.rb +0 -0
- /data/{tests → test}/jfk_reader/jfk_reader.c +0 -0
- /data/{tests → test}/test_callback.rb +0 -0
- /data/{tests → test}/test_error.rb +0 -0
- /data/{tests → test}/test_params.rb +0 -0
- /data/{tests → test}/test_vad.rb +0 -0
- /data/{tests → test}/test_vad_params.rb +0 -0
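
The largest single-file change in this release is the vendored OpenCL backend (`data/ext/sources/ggml/src/ggml-opencl/ggml-opencl.cpp`, +1859 -489), which adds reference-counted backend lifetime management, many new element-wise kernels, and optional per-kernel profiling; an excerpt of its diff is rendered below. The new profiling code is built on standard OpenCL event timestamps. As background, here is a minimal, self-contained sketch of that mechanism — assumptions: an OpenCL 2.0+ platform, and the `noop` kernel plus all names here are illustrative placeholders, not code from the package:

```cpp
// Minimal sketch of OpenCL event profiling — the same QUEUED/SUBMIT/START/END
// timestamps the ProfilingInfo code in the diff below records per kernel launch.
// Placeholder kernel and names; error checks mostly elided for brevity.
#include <CL/cl.h>
#include <cstdio>

int main() {
    const char *src =
        "__kernel void noop(__global float *x) { x[get_global_id(0)] *= 2.0f; }";

    cl_platform_id platform; cl_device_id device; cl_int err;
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
    cl_context ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

    // Profiling must be enabled on the queue or clGetEventProfilingInfo fails.
    cl_queue_properties props[] = { CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0 };
    cl_command_queue queue = clCreateCommandQueueWithProperties(ctx, device, props, &err);

    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    clBuildProgram(prog, 1, &device, NULL, NULL, NULL);
    cl_kernel kernel = clCreateKernel(prog, "noop", &err);

    float host[1024] = {0};
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                sizeof(host), host, &err);
    clSetKernelArg(kernel, 0, sizeof(buf), &buf);

    // Enqueue with an event, wait, then read the four timestamps.
    size_t global = 1024;
    cl_event evt;
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &evt);
    clWaitForEvents(1, &evt);

    cl_ulong queued, submit, start, end;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(queued), &queued, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(submit), &submit, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,  sizeof(start),  &start,  NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,    sizeof(end),    &end,    NULL);
    printf("queue wait %.3f ms, submit %.3f ms, exec %.3f ms\n",
           (submit - queued) / 1e6, (start - submit) / 1e6, (end - start) / 1e6);

    clReleaseEvent(evt); clReleaseMemObject(buf); clReleaseKernel(kernel);
    clReleaseProgram(prog); clReleaseCommandQueue(queue); clReleaseContext(ctx);
    return 0;
}
```

When the backend is built with `GGML_OPENCL_PROFILING`, the diff's `write_profiling_info()` aggregates exactly these timestamps into `cl_profiling.csv` and a Chrome-trace-format `cl_trace.json`.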
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
|
231
231
|
return { type, major, minor, patch };
|
232
232
|
}
|
233
233
|
|
234
|
+
// Profiling
|
235
|
+
struct ProfilingInfo {
|
236
|
+
std::string op_name;
|
237
|
+
std::string kernel_name;
|
238
|
+
|
239
|
+
cl_kernel kernel;
|
240
|
+
cl_event evt;
|
241
|
+
|
242
|
+
cl_ulong cmd_queued;
|
243
|
+
cl_ulong cmd_submit;
|
244
|
+
cl_ulong cmd_start;
|
245
|
+
cl_ulong cmd_end;
|
246
|
+
cl_ulong overhead_start;
|
247
|
+
cl_ulong overhead_end;
|
248
|
+
// For the times below, see spec for clGetEventProfilingInfo
|
249
|
+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
250
|
+
cl_ulong cmd_queued_duration_ns;
|
251
|
+
// The time kernel spent for submission - START - SUBMIT
|
252
|
+
cl_ulong cmd_submit_duration_ns;
|
253
|
+
// Kernel execution time in nanoseconds - END - START
|
254
|
+
cl_ulong cmd_duration_ns;
|
255
|
+
// The time for the kernel to complete - COMPLETE - END
|
256
|
+
cl_ulong cmd_complete_duration_ns;
|
257
|
+
// Total time to finish the kernel - COMPELTE - QUEUED
|
258
|
+
cl_ulong cmd_total_duration_ns;
|
259
|
+
// Global and local work sizes.
|
260
|
+
size_t global_size[3];
|
261
|
+
size_t local_size[3];
|
262
|
+
// Op output size.
|
263
|
+
size_t output_size[4];
|
264
|
+
};
|
265
|
+
|
266
|
+
static void populateProfilingInfo(
|
267
|
+
ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
|
268
|
+
size_t global_size[3], size_t local_size[3],
|
269
|
+
const ggml_tensor * tensor) {
|
270
|
+
info.op_name = tensor->name;
|
271
|
+
info.kernel = kernel;
|
272
|
+
info.evt = evt;
|
273
|
+
|
274
|
+
// 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
|
275
|
+
info.local_size[0] = 0;
|
276
|
+
info.local_size[1] = 0;
|
277
|
+
info.local_size[2] = 0;
|
278
|
+
|
279
|
+
info.global_size[0] = 0;
|
280
|
+
info.global_size[1] = 0;
|
281
|
+
info.global_size[2] = 0;
|
282
|
+
|
283
|
+
if (local_size) {
|
284
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
285
|
+
info.local_size[i] = local_size[i];
|
286
|
+
}
|
287
|
+
}
|
288
|
+
|
289
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
290
|
+
info.global_size[i] = global_size[i];
|
291
|
+
}
|
292
|
+
|
293
|
+
info.output_size[0] = tensor->ne[0];
|
294
|
+
info.output_size[1] = tensor->ne[1];
|
295
|
+
info.output_size[2] = tensor->ne[2];
|
296
|
+
info.output_size[3] = tensor->ne[3];
|
297
|
+
}
|
298
|
+
|
234
299
|
struct ggml_backend_opencl_context;
|
235
300
|
|
236
301
|
// backend device context
|
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
|
|
254
319
|
|
255
320
|
// backend context
|
256
321
|
struct ggml_backend_opencl_context {
|
322
|
+
int ref_count;
|
323
|
+
|
257
324
|
cl_device_id device;
|
258
325
|
std::string device_name;
|
259
326
|
|
@@ -284,6 +351,7 @@ struct ggml_backend_opencl_context {
|
|
284
351
|
cl_program program_gemv_noshuffle_general;
|
285
352
|
cl_program program_gemv_noshuffle;
|
286
353
|
cl_program program_get_rows;
|
354
|
+
cl_program program_glu;
|
287
355
|
cl_program program_im2col_f16;
|
288
356
|
cl_program program_im2col_f32;
|
289
357
|
cl_program program_mul_mat_Ab_Bi_8x4;
|
@@ -299,27 +367,46 @@ struct ggml_backend_opencl_context {
|
|
299
367
|
cl_program program_mul_mv_f16_f32;
|
300
368
|
cl_program program_mul_mv_f32_f32;
|
301
369
|
cl_program program_mul;
|
370
|
+
cl_program program_div;
|
371
|
+
cl_program program_sub;
|
302
372
|
cl_program program_norm;
|
303
373
|
cl_program program_relu;
|
304
374
|
cl_program program_rms_norm;
|
375
|
+
cl_program program_group_norm;
|
305
376
|
cl_program program_rope;
|
306
377
|
cl_program program_scale;
|
307
378
|
cl_program program_silu;
|
379
|
+
cl_program program_sigmoid;
|
308
380
|
cl_program program_softmax_f32;
|
309
381
|
cl_program program_softmax_f16;
|
310
382
|
cl_program program_softmax_4_f32;
|
311
383
|
cl_program program_softmax_4_f16;
|
384
|
+
cl_program program_argsort_f32_i32;
|
385
|
+
cl_program program_sum_rows_f32;
|
386
|
+
cl_program program_repeat;
|
387
|
+
cl_program program_pad;
|
388
|
+
cl_program program_tanh;
|
389
|
+
cl_program program_upscale;
|
390
|
+
cl_program program_concat;
|
391
|
+
cl_program program_tsembd;
|
392
|
+
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
312
393
|
|
313
394
|
cl_kernel kernel_add, kernel_add_row;
|
314
395
|
cl_kernel kernel_mul, kernel_mul_row;
|
396
|
+
cl_kernel kernel_div, kernel_div_row;
|
397
|
+
cl_kernel kernel_sub, kernel_sub_row;
|
315
398
|
cl_kernel kernel_scale;
|
316
399
|
cl_kernel kernel_silu, kernel_silu_4;
|
317
400
|
cl_kernel kernel_gelu, kernel_gelu_4;
|
318
401
|
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
319
402
|
cl_kernel kernel_relu;
|
403
|
+
cl_kernel kernel_sigmoid_f32, kernel_sigmoid_f16;
|
320
404
|
cl_kernel kernel_clamp;
|
405
|
+
cl_kernel kernel_geglu, kernel_reglu, kernel_swiglu,
|
406
|
+
kernel_geglu_f16, kernel_reglu_f16, kernel_swiglu_f16;
|
321
407
|
cl_kernel kernel_norm;
|
322
408
|
cl_kernel kernel_rms_norm;
|
409
|
+
cl_kernel kernel_group_norm;
|
323
410
|
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
|
324
411
|
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
325
412
|
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
@@ -339,6 +426,120 @@ struct ggml_backend_opencl_context {
|
|
339
426
|
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
340
427
|
cl_kernel kernel_mul_mv_q6_K_f32;
|
341
428
|
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
429
|
+
cl_kernel kernel_argsort_f32_i32;
|
430
|
+
cl_kernel kernel_sum_rows_f32;
|
431
|
+
cl_kernel kernel_repeat;
|
432
|
+
cl_kernel kernel_pad;
|
433
|
+
cl_kernel kernel_tanh_f32_nd;
|
434
|
+
cl_kernel kernel_tanh_f16_nd;
|
435
|
+
cl_kernel kernel_upscale;
|
436
|
+
cl_kernel kernel_upscale_bilinear;
|
437
|
+
cl_kernel kernel_concat_f32_contiguous;
|
438
|
+
cl_kernel kernel_concat_f32_non_contiguous;
|
439
|
+
cl_kernel kernel_timestep_embedding;
|
440
|
+
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
441
|
+
|
442
|
+
std::vector<ProfilingInfo> profiling_info;
|
443
|
+
|
444
|
+
void write_profiling_info() {
|
445
|
+
FILE * fperf = fopen("cl_profiling.csv", "w");
|
446
|
+
if (!fperf) {
|
447
|
+
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
448
|
+
return;
|
449
|
+
}
|
450
|
+
|
451
|
+
// Populate profiling info
|
452
|
+
for (ProfilingInfo & info : profiling_info) {
|
453
|
+
cl_ulong cmd_queued;
|
454
|
+
cl_ulong cmd_submit;
|
455
|
+
cl_ulong cmd_start;
|
456
|
+
cl_ulong cmd_end;
|
457
|
+
cl_ulong cmd_complete;
|
458
|
+
|
459
|
+
CL_CHECK(clWaitForEvents(1, &info.evt));
|
460
|
+
CL_CHECK(clGetEventProfilingInfo(
|
461
|
+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
462
|
+
CL_CHECK(clGetEventProfilingInfo(
|
463
|
+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
464
|
+
CL_CHECK(clGetEventProfilingInfo(
|
465
|
+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
466
|
+
CL_CHECK(clGetEventProfilingInfo(
|
467
|
+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
468
|
+
CL_CHECK(clGetEventProfilingInfo(
|
469
|
+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
470
|
+
CL_CHECK(clReleaseEvent(info.evt));
|
471
|
+
|
472
|
+
char kernel_name[512];
|
473
|
+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
474
|
+
sizeof(kernel_name), kernel_name, NULL));
|
475
|
+
info.kernel_name = kernel_name;
|
476
|
+
|
477
|
+
info.cmd_queued = cmd_queued;
|
478
|
+
info.cmd_submit = cmd_submit;
|
479
|
+
info.cmd_start = cmd_start;
|
480
|
+
info.cmd_end = cmd_end;
|
481
|
+
|
482
|
+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
483
|
+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
484
|
+
info.cmd_duration_ns = cmd_end - cmd_start;
|
485
|
+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
486
|
+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
487
|
+
}
|
488
|
+
|
489
|
+
// Dump a csv
|
490
|
+
float total_kernel_time = 0;
|
491
|
+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
492
|
+
for (const ProfilingInfo & info : profiling_info) {
|
493
|
+
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
494
|
+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
495
|
+
info.op_name.c_str(), info.kernel_name.c_str(),
|
496
|
+
info.cmd_queued_duration_ns/1.e6f,
|
497
|
+
info.cmd_submit_duration_ns/1.e6f,
|
498
|
+
info.cmd_duration_ns/1.e6f,
|
499
|
+
info.cmd_complete_duration_ns/1.e6f,
|
500
|
+
info.cmd_total_duration_ns/1.e6f,
|
501
|
+
info.global_size[0], info.global_size[1], info.global_size[2],
|
502
|
+
info.local_size[0], info.local_size[1], info.local_size[2],
|
503
|
+
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
504
|
+
}
|
505
|
+
fclose(fperf);
|
506
|
+
|
507
|
+
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
508
|
+
|
509
|
+
// Dump a simple chrome trace
|
510
|
+
FILE* ftrace = fopen("cl_trace.json", "w");
|
511
|
+
if (!ftrace) {
|
512
|
+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
513
|
+
return;
|
514
|
+
}
|
515
|
+
|
516
|
+
fprintf(ftrace, "[\n");
|
517
|
+
for (const ProfilingInfo & info : profiling_info) {
|
518
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
519
|
+
info.kernel_name.c_str(), info.cmd_queued/1000);
|
520
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
521
|
+
info.kernel_name.c_str(), info.cmd_submit/1000);
|
522
|
+
|
523
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
524
|
+
info.kernel_name.c_str(), info.cmd_start/1000);
|
525
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
526
|
+
info.kernel_name.c_str(), info.cmd_end/1000);
|
527
|
+
}
|
528
|
+
fclose(ftrace);
|
529
|
+
}
|
530
|
+
|
531
|
+
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
532
|
+
#ifdef GGML_OPENCL_PROFILING
|
533
|
+
cl_event evt;
|
534
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
535
|
+
|
536
|
+
profiling_info.emplace_back();
|
537
|
+
populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
|
538
|
+
#else
|
539
|
+
GGML_UNUSED(tensor);
|
540
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
541
|
+
#endif
|
542
|
+
}
|
342
543
|
|
343
544
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
344
545
|
// Transpose kernels
|
@@ -366,46 +567,19 @@ struct ggml_backend_opencl_context {
|
|
366
567
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
|
367
568
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
|
368
569
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
369
|
-
};
|
370
|
-
|
371
|
-
// All registered devices with a default device in the front.
|
372
|
-
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
373
570
|
|
374
|
-
|
571
|
+
void free() {
|
572
|
+
ref_count--;
|
573
|
+
if (ref_count == 0) {
|
375
574
|
#ifdef GGML_OPENCL_PROFILING
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
cl_kernel kernel;
|
381
|
-
cl_event evt;
|
382
|
-
|
383
|
-
cl_ulong cmd_queued;
|
384
|
-
cl_ulong cmd_submit;
|
385
|
-
cl_ulong cmd_start;
|
386
|
-
cl_ulong cmd_end;
|
387
|
-
cl_ulong overhead_start;
|
388
|
-
cl_ulong overhead_end;
|
389
|
-
// For the times below, see spec for clGetEventProfilingInfo
|
390
|
-
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
391
|
-
cl_ulong cmd_queued_duration_ns;
|
392
|
-
// The time kernel spent for submission - START - SUBMIT
|
393
|
-
cl_ulong cmd_submit_duration_ns;
|
394
|
-
// Kernel execution time in nanoseconds - END - START
|
395
|
-
cl_ulong cmd_duration_ns;
|
396
|
-
// The time for the kernel to complete - COMPLETE - END
|
397
|
-
cl_ulong cmd_complete_duration_ns;
|
398
|
-
// Total time to finish the kernel - COMPELTE - QUEUED
|
399
|
-
cl_ulong cmd_total_duration_ns;
|
400
|
-
// Global and local work sizes.
|
401
|
-
size_t global_size[3];
|
402
|
-
size_t local_size[3];
|
403
|
-
// Op output size.
|
404
|
-
size_t output_size[4];
|
575
|
+
write_profiling_info();
|
576
|
+
#endif
|
577
|
+
}
|
578
|
+
}
|
405
579
|
};
|
406
580
|
|
407
|
-
|
408
|
-
|
581
|
+
// All registered devices with a default device in the front.
|
582
|
+
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
409
583
|
|
410
584
|
inline std::string read_file(const std::string &path) {
|
411
585
|
std::ifstream ifs(path);
|
@@ -567,6 +741,27 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
567
741
|
GGML_LOG_CONT(".");
|
568
742
|
}
|
569
743
|
|
744
|
+
// glu
|
745
|
+
{
|
746
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
747
|
+
const std::string kernel_src {
|
748
|
+
#include "glu.cl.h"
|
749
|
+
};
|
750
|
+
#else
|
751
|
+
const std::string kernel_src = read_file("glu.cl");
|
752
|
+
#endif
|
753
|
+
backend_ctx->program_glu =
|
754
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
755
|
+
|
756
|
+
CL_CHECK((backend_ctx->kernel_geglu = clCreateKernel(backend_ctx->program_glu, "kernel_geglu", &err), err));
|
757
|
+
CL_CHECK((backend_ctx->kernel_reglu = clCreateKernel(backend_ctx->program_glu, "kernel_reglu", &err), err));
|
758
|
+
CL_CHECK((backend_ctx->kernel_swiglu = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu", &err), err));
|
759
|
+
CL_CHECK((backend_ctx->kernel_geglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_geglu_f16", &err), err));
|
760
|
+
CL_CHECK((backend_ctx->kernel_reglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_reglu_f16", &err), err));
|
761
|
+
CL_CHECK((backend_ctx->kernel_swiglu_f16 = clCreateKernel(backend_ctx->program_glu, "kernel_swiglu_f16", &err), err));
|
762
|
+
GGML_LOG_CONT(".");
|
763
|
+
}
|
764
|
+
|
570
765
|
// get_rows
|
571
766
|
{
|
572
767
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
@@ -986,152 +1181,411 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
986
1181
|
GGML_LOG_CONT(".");
|
987
1182
|
}
|
988
1183
|
|
989
|
-
//
|
990
|
-
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
991
|
-
// transpose
|
1184
|
+
// argsort
|
992
1185
|
{
|
993
1186
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
994
1187
|
const std::string kernel_src {
|
995
|
-
#include "
|
1188
|
+
#include "argsort.cl.h"
|
996
1189
|
};
|
997
1190
|
#else
|
998
|
-
const std::string kernel_src = read_file("
|
1191
|
+
const std::string kernel_src = read_file("argsort.cl");
|
999
1192
|
#endif
|
1000
|
-
backend_ctx->
|
1193
|
+
backend_ctx->program_argsort_f32_i32 =
|
1001
1194
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1002
1195
|
|
1003
|
-
CL_CHECK((backend_ctx->
|
1004
|
-
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
|
1005
|
-
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
|
1196
|
+
CL_CHECK((backend_ctx->kernel_argsort_f32_i32 = clCreateKernel(backend_ctx->program_argsort_f32_i32, "kernel_argsort_f32_i32", &err), err));
|
1006
1197
|
GGML_LOG_CONT(".");
|
1007
1198
|
}
|
1008
1199
|
|
1009
|
-
//
|
1200
|
+
// div
|
1010
1201
|
{
|
1011
|
-
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1012
|
-
" -cl-mad-enable "
|
1013
|
-
" -DSIMDGROUP_WIDTH=" +
|
1014
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
1015
|
-
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1016
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1017
|
-
}
|
1018
|
-
|
1019
1202
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1020
|
-
const std::string
|
1021
|
-
#include "
|
1203
|
+
const std::string kernel_src {
|
1204
|
+
#include "div.cl.h"
|
1022
1205
|
};
|
1023
1206
|
#else
|
1024
|
-
const std::string
|
1207
|
+
const std::string kernel_src = read_file("div.cl");
|
1025
1208
|
#endif
|
1209
|
+
backend_ctx->program_div =
|
1210
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1026
1211
|
|
1027
|
-
backend_ctx->
|
1028
|
-
|
1029
|
-
|
1030
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
1212
|
+
CL_CHECK((backend_ctx->kernel_div = clCreateKernel(backend_ctx->program_div, "kernel_div", &err), err));
|
1213
|
+
CL_CHECK((backend_ctx->kernel_div_row = clCreateKernel(backend_ctx->program_div, "kernel_div_row", &err), err));
|
1031
1214
|
GGML_LOG_CONT(".");
|
1032
1215
|
}
|
1033
1216
|
|
1034
|
-
//
|
1217
|
+
// sub
|
1035
1218
|
{
|
1036
|
-
// Gemv 2048, 16384
|
1037
|
-
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1038
|
-
" -cl-mad-enable "
|
1039
|
-
" -DLINE_STRIDE_A=2048 "
|
1040
|
-
" -DBLOCK_STRIDE_A=16384 "
|
1041
|
-
" -DSIMDGROUP_WIDTH=" +
|
1042
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
1043
|
-
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1044
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1045
|
-
}
|
1046
|
-
|
1047
1219
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1048
|
-
const std::string
|
1049
|
-
#include "
|
1220
|
+
const std::string kernel_src {
|
1221
|
+
#include "sub.cl.h"
|
1050
1222
|
};
|
1051
1223
|
#else
|
1052
|
-
const std::string
|
1224
|
+
const std::string kernel_src = read_file("sub.cl");
|
1053
1225
|
#endif
|
1226
|
+
backend_ctx->program_sub =
|
1227
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1054
1228
|
|
1055
|
-
backend_ctx->
|
1056
|
-
|
1057
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1229
|
+
CL_CHECK((backend_ctx->kernel_sub = clCreateKernel(backend_ctx->program_sub, "kernel_sub", &err), err));
|
1230
|
+
CL_CHECK((backend_ctx->kernel_sub_row = clCreateKernel(backend_ctx->program_sub, "kernel_sub_row", &err), err));
|
1058
1231
|
GGML_LOG_CONT(".");
|
1232
|
+
}
|
1059
1233
|
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1234
|
+
// sum_rows
|
1235
|
+
{
|
1236
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1237
|
+
const std::string kernel_src {
|
1238
|
+
#include "sum_rows.cl.h"
|
1239
|
+
};
|
1240
|
+
#else
|
1241
|
+
const std::string kernel_src = read_file("sum_rows.cl");
|
1242
|
+
#endif
|
1243
|
+
backend_ctx->program_sum_rows_f32 =
|
1244
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1070
1245
|
|
1071
|
-
backend_ctx->
|
1072
|
-
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1073
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
1246
|
+
CL_CHECK((backend_ctx->kernel_sum_rows_f32 = clCreateKernel(backend_ctx->program_sum_rows_f32, "kernel_sum_rows_f32", &err), err));
|
1074
1247
|
GGML_LOG_CONT(".");
|
1248
|
+
}
|
1075
1249
|
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1250
|
+
// sigmoid
|
1251
|
+
{
|
1252
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1253
|
+
const std::string kernel_src {
|
1254
|
+
#include "sigmoid.cl.h"
|
1255
|
+
};
|
1256
|
+
#else
|
1257
|
+
const std::string kernel_src = read_file("sigmoid.cl");
|
1258
|
+
#endif
|
1259
|
+
backend_ctx->program_sigmoid =
|
1260
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1086
1261
|
|
1087
|
-
backend_ctx->
|
1088
|
-
|
1089
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1262
|
+
CL_CHECK((backend_ctx->kernel_sigmoid_f32 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f32", &err), err));
|
1263
|
+
CL_CHECK((backend_ctx->kernel_sigmoid_f16 = clCreateKernel(backend_ctx->program_sigmoid, "kernel_sigmoid_f16", &err), err));
|
1090
1264
|
GGML_LOG_CONT(".");
|
1265
|
+
}
|
1091
1266
|
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1267
|
+
// group_norm
|
1268
|
+
{
|
1269
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1270
|
+
const std::string kernel_src {
|
1271
|
+
#include "group_norm.cl.h"
|
1272
|
+
};
|
1273
|
+
#else
|
1274
|
+
const std::string kernel_src = read_file("group_norm.cl");
|
1275
|
+
#endif
|
1276
|
+
backend_ctx->program_group_norm =
|
1277
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1103
1278
|
|
1104
|
-
backend_ctx->
|
1105
|
-
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1106
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1279
|
+
CL_CHECK((backend_ctx->kernel_group_norm = clCreateKernel(backend_ctx->program_group_norm, "kernel_group_norm", &err), err));
|
1107
1280
|
GGML_LOG_CONT(".");
|
1108
1281
|
}
|
1109
1282
|
|
1110
|
-
//
|
1283
|
+
// repeat
|
1111
1284
|
{
|
1112
1285
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1113
|
-
const std::string
|
1114
|
-
#include "
|
1286
|
+
const std::string kernel_src {
|
1287
|
+
#include "repeat.cl.h"
|
1115
1288
|
};
|
1116
1289
|
#else
|
1117
|
-
const std::string
|
1290
|
+
const std::string kernel_src = read_file("repeat.cl");
|
1118
1291
|
#endif
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1292
|
+
if (!kernel_src.empty()) {
|
1293
|
+
backend_ctx->program_repeat =
|
1294
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1295
|
+
CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
|
1296
|
+
GGML_LOG_CONT(".");
|
1297
|
+
} else {
|
1298
|
+
GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
|
1299
|
+
backend_ctx->program_repeat = nullptr;
|
1300
|
+
backend_ctx->kernel_repeat = nullptr;
|
1301
|
+
}
|
1122
1302
|
}
|
1123
|
-
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
1124
|
-
GGML_LOG_CONT("\n");
|
1125
|
-
}
|
1126
1303
|
|
1127
|
-
//
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1304
|
+
// pad
|
1305
|
+
{
|
1306
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1307
|
+
const std::string kernel_src {
|
1308
|
+
#include "pad.cl.h"
|
1309
|
+
};
|
1310
|
+
#else
|
1311
|
+
const std::string kernel_src = read_file("pad.cl");
|
1312
|
+
#endif
|
1313
|
+
if (!kernel_src.empty()) {
|
1314
|
+
backend_ctx->program_pad =
|
1315
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1316
|
+
CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
|
1317
|
+
GGML_LOG_CONT(".");
|
1318
|
+
} else {
|
1319
|
+
GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
|
1320
|
+
backend_ctx->program_pad = nullptr;
|
1321
|
+
backend_ctx->kernel_pad = nullptr;
|
1322
|
+
}
|
1323
|
+
}
|
1324
|
+
|
1325
|
+
// tanh
|
1326
|
+
{
|
1327
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1328
|
+
const std::string kernel_src {
|
1329
|
+
#include "tanh.cl.h"
|
1330
|
+
};
|
1331
|
+
#else
|
1332
|
+
const std::string kernel_src = read_file("tanh.cl");
|
1333
|
+
#endif
|
1334
|
+
if (!kernel_src.empty()) {
|
1335
|
+
backend_ctx->program_tanh =
|
1336
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1337
|
+
CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
|
1338
|
+
CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
|
1339
|
+
GGML_LOG_CONT(".");
|
1340
|
+
} else {
|
1341
|
+
GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
|
1342
|
+
backend_ctx->program_tanh = nullptr;
|
1343
|
+
backend_ctx->kernel_tanh_f32_nd = nullptr;
|
1344
|
+
backend_ctx->kernel_tanh_f16_nd = nullptr;
|
1345
|
+
}
|
1346
|
+
}
|
1347
|
+
|
1348
|
+
// upscale
|
1349
|
+
{
|
1350
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1351
|
+
const std::string kernel_src {
|
1352
|
+
#include "upscale.cl.h"
|
1353
|
+
};
|
1354
|
+
#else
|
1355
|
+
const std::string kernel_src = read_file("upscale.cl");
|
1356
|
+
#endif
|
1357
|
+
if (!kernel_src.empty()) {
|
1358
|
+
backend_ctx->program_upscale =
|
1359
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1360
|
+
CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
|
1361
|
+
if (backend_ctx->program_upscale) {
|
1362
|
+
cl_int err_bilinear;
|
1363
|
+
backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
|
1364
|
+
if (err_bilinear != CL_SUCCESS) {
|
1365
|
+
GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
|
1366
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
1367
|
+
}
|
1368
|
+
} else {
|
1369
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
1370
|
+
}
|
1371
|
+
GGML_LOG_CONT(".");
|
1372
|
+
} else {
|
1373
|
+
GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
|
1374
|
+
backend_ctx->program_upscale = nullptr;
|
1375
|
+
backend_ctx->kernel_upscale = nullptr;
|
1376
|
+
backend_ctx->kernel_upscale_bilinear = nullptr;
|
1377
|
+
}
|
1378
|
+
}
|
1379
|
+
|
1380
|
+
// concat
|
1381
|
+
{
|
1382
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1383
|
+
const std::string kernel_src {
|
1384
|
+
#include "concat.cl.h"
|
1385
|
+
};
|
1386
|
+
#else
|
1387
|
+
|
1388
|
+
const std::string kernel_src = read_file("concat.cl");
|
1389
|
+
#endif
|
1390
|
+
if (!kernel_src.empty()) {
|
1391
|
+
backend_ctx->program_concat =
|
1392
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1393
|
+
|
1394
|
+
CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
|
1395
|
+
CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
|
1396
|
+
GGML_LOG_CONT(".");
|
1397
|
+
} else {
|
1398
|
+
GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
|
1399
|
+
backend_ctx->program_concat = nullptr;
|
1400
|
+
backend_ctx->kernel_concat_f32_contiguous = nullptr;
|
1401
|
+
backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
|
1402
|
+
}
|
1403
|
+
}
|
1404
|
+
|
1405
|
+
// timestep_embedding
|
1406
|
+
{
|
1407
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1408
|
+
const std::string kernel_src {
|
1409
|
+
#include "tsembd.cl.h"
|
1410
|
+
};
|
1411
|
+
#else
|
1412
|
+
|
1413
|
+
const std::string kernel_src = read_file("tsembd.cl");
|
1414
|
+
#endif
|
1415
|
+
if (!kernel_src.empty()) {
|
1416
|
+
backend_ctx->program_tsembd =
|
1417
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1418
|
+
CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
|
1419
|
+
GGML_LOG_CONT(".");
|
1420
|
+
} else {
|
1421
|
+
GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
|
1422
|
+
backend_ctx->program_tsembd = nullptr;
|
1423
|
+
backend_ctx->kernel_timestep_embedding = nullptr;
|
1424
|
+
}
|
1425
|
+
}
|
1426
|
+
|
1427
|
+
// mul_mv_id_q4_0_f32_8x_flat
|
1428
|
+
{
|
1429
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1430
|
+
const std::string kernel_src {
|
1431
|
+
#include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
|
1432
|
+
};
|
1433
|
+
#else
|
1434
|
+
const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
|
1435
|
+
#endif
|
1436
|
+
backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
|
1437
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1438
|
+
|
1439
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
|
1440
|
+
GGML_LOG_CONT(".");
|
1441
|
+
}
|
1442
|
+
|
1443
|
+
// Adreno kernels
|
1444
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
1445
|
+
// transpose
|
1446
|
+
{
|
1447
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1448
|
+
const std::string kernel_src {
|
1449
|
+
#include "transpose.cl.h"
|
1450
|
+
};
|
1451
|
+
#else
|
1452
|
+
const std::string kernel_src = read_file("transpose.cl");
|
1453
|
+
#endif
|
1454
|
+
backend_ctx->program_transpose =
|
1455
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
1456
|
+
|
1457
|
+
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
|
1458
|
+
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
|
1459
|
+
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
|
1460
|
+
GGML_LOG_CONT(".");
|
1461
|
+
}
|
1462
|
+
|
1463
|
+
// gemv_noshuffle_general
|
1464
|
+
{
|
1465
|
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1466
|
+
" -cl-mad-enable "
|
1467
|
+
" -DSIMDGROUP_WIDTH=" +
|
1468
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
1469
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1470
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1471
|
+
}
|
1472
|
+
|
1473
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1474
|
+
const std::string kernel_src_CL_gemv_general {
|
1475
|
+
#include "gemv_noshuffle_general.cl.h"
|
1476
|
+
};
|
1477
|
+
#else
|
1478
|
+
const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
|
1479
|
+
#endif
|
1480
|
+
|
1481
|
+
backend_ctx->program_CL_gemv_general = build_program_from_source(
|
1482
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
|
1483
|
+
|
1484
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
1485
|
+
GGML_LOG_CONT(".");
|
1486
|
+
}
|
1487
|
+
|
1488
|
+
// gemv_noshuffle
|
1489
|
+
{
|
1490
|
+
// Gemv 2048, 16384
|
1491
|
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1492
|
+
" -cl-mad-enable "
|
1493
|
+
" -DLINE_STRIDE_A=2048 "
|
1494
|
+
" -DBLOCK_STRIDE_A=16384 "
|
1495
|
+
" -DSIMDGROUP_WIDTH=" +
|
1496
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
1497
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1498
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1499
|
+
}
|
1500
|
+
|
1501
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1502
|
+
const std::string kernel_src_CL_gemv {
|
1503
|
+
#include "gemv_noshuffle.cl.h"
|
1504
|
+
};
|
1505
|
+
#else
|
1506
|
+
const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
|
1507
|
+
#endif
|
1508
|
+
|
1509
|
+
backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
|
1510
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1511
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1512
|
+
GGML_LOG_CONT(".");
|
1513
|
+
|
1514
|
+
// Gemv 2048, 16384
|
1515
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1516
|
+
" -cl-mad-enable "
|
1517
|
+
" -DLINE_STRIDE_A=2048 "
|
1518
|
+
" -DBLOCK_STRIDE_A=16384 "
|
1519
|
+
" -DSIMDGROUP_WIDTH=" +
|
1520
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
1521
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1522
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1523
|
+
}
|
1524
|
+
|
1525
|
+
backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
|
1526
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1527
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
1528
|
+
GGML_LOG_CONT(".");
|
1529
|
+
|
1530
|
+
// Gemv 5504, 44032
|
1531
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1532
|
+
" -cl-mad-enable "
|
1533
|
+
" -DLINE_STRIDE_A=5504 "
|
1534
|
+
" -DBLOCK_STRIDE_A=44032 "
|
1535
|
+
" -DSIMDGROUP_WIDTH=" +
|
1536
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
1537
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1538
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1539
|
+
}
|
1540
|
+
|
1541
|
+
backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
|
1542
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1543
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1544
|
+
GGML_LOG_CONT(".");
|
1545
|
+
|
1546
|
+
// Gemv 16000, 128000
|
1547
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
1548
|
+
" -cl-mad-enable "
|
1549
|
+
" -DLINE_STRIDE_A=16000 "
|
1550
|
+
" -DBLOCK_STRIDE_A=128000 "
|
1551
|
+
" -DSIMDGROUP_WIDTH=" +
|
1552
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
1553
|
+
|
1554
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
1555
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
1556
|
+
}
|
1557
|
+
|
1558
|
+
backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
|
1559
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
1560
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
|
1561
|
+
GGML_LOG_CONT(".");
|
1562
|
+
}
|
1563
|
+
|
1564
|
+
// mul_mat_Ab_Bi_8x4
|
1565
|
+
{
|
1566
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
1567
|
+
const std::string kernel_src_CL_gemm {
|
1568
|
+
#include "mul_mat_Ab_Bi_8x4.cl.h"
|
1569
|
+
};
|
1570
|
+
#else
|
1571
|
+
const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
|
1572
|
+
#endif
|
1573
|
+
backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
|
1574
|
+
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
|
1575
|
+
GGML_LOG_CONT(".");
|
1576
|
+
}
|
1577
|
+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
1578
|
+
GGML_LOG_CONT("\n");
|
1579
|
+
}
|
1580
|
+
|
1581
|
+
// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
1582
|
+
// XXX static bool initialized = false;
|
1583
|
+
// XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
|
1584
|
+
|
1585
|
+
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
|
1586
|
+
|
1587
|
+
namespace /* anonymous */ {
|
1588
|
+
extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
|
1135
1589
|
}
|
1136
1590
|
|
1137
1591
|
// Look for available and suitable devices.
|
@@ -1381,6 +1835,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
1381
1835
|
backend_ctx->device = dev_ctx->device;
|
1382
1836
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
1383
1837
|
|
1838
|
+
// ref_count get increased in ggml_backend_opencl_device_init
|
1839
|
+
// This function is also used to retrieve backend context, so we don't want
|
1840
|
+
// to increase ref_count for each call. We only want to increase ref_count
|
1841
|
+
// when the associated device is initialized
|
1842
|
+
backend_ctx->ref_count = 0;
|
1843
|
+
|
1384
1844
|
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
1385
1845
|
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
1386
1846
|
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
@@ -1553,93 +2013,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
1553
2013
|
return dev_ctx->backend_ctx;
|
1554
2014
|
}
|
1555
2015
|
|
1556
|
-
static void ggml_cl2_free(
|
1557
|
-
|
1558
|
-
|
1559
|
-
if (!fperf) {
|
1560
|
-
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
1561
|
-
return;
|
1562
|
-
}
|
2016
|
+
static void ggml_cl2_free(ggml_backend_t backend) {
|
2017
|
+
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
2018
|
+
ctx->free();
|
1563
2019
|
|
1564
|
-
//
|
1565
|
-
|
1566
|
-
|
1567
|
-
|
1568
|
-
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1572
|
-
CL_CHECK(clWaitForEvents(1, &info.evt));
|
1573
|
-
CL_CHECK(clGetEventProfilingInfo(
|
1574
|
-
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
1575
|
-
CL_CHECK(clGetEventProfilingInfo(
|
1576
|
-
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
1577
|
-
CL_CHECK(clGetEventProfilingInfo(
|
1578
|
-
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
1579
|
-
CL_CHECK(clGetEventProfilingInfo(
|
1580
|
-
-            info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
-        CL_CHECK(clGetEventProfilingInfo(
-            info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
-        CL_CHECK(clReleaseEvent(info.evt));
-
-        char kernel_name[512];
-        CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
-            sizeof(kernel_name), kernel_name, NULL));
-        info.kernel_name = kernel_name;
-
-        info.cmd_queued = cmd_queued;
-        info.cmd_submit = cmd_submit;
-        info.cmd_start  = cmd_start;
-        info.cmd_end    = cmd_end;
-
-        info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
-        info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
-        info.cmd_duration_ns          = cmd_end      - cmd_start;
-        info.cmd_complete_duration_ns = cmd_complete - cmd_end;
-        info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
-    }
-
-    // Dump a csv
-    float total_kernel_time = 0;
-    fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
-    for (const ProfilingInfo & info : g_profiling_info) {
-        total_kernel_time += info.cmd_duration_ns/1.e6f;
-        fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
-            info.op_name.c_str(), info.kernel_name.c_str(),
-            info.cmd_queued_duration_ns/1.e6f,
-            info.cmd_submit_duration_ns/1.e6f,
-            info.cmd_duration_ns/1.e6f,
-            info.cmd_complete_duration_ns/1.e6f,
-            info.cmd_total_duration_ns/1.e6f,
-            info.global_size[0], info.global_size[1], info.global_size[2],
-            info.local_size[0], info.local_size[1], info.local_size[2],
-            info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
-    }
-    fclose(fperf);
-
-    GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
-
-    // Dump a simple chrome trace
-    FILE* ftrace = fopen("cl_trace.json", "w");
-    if (!ftrace) {
-        GGML_LOG_ERROR("Failed to open cl_trace.json\n");
-        return;
+    // The CL context is shared by all backends, release it if all backends have been released
+    bool should_release_opencl = true;
+    for (auto device : g_ggml_backend_opencl_devices) {
+        ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
+        if (ctx_dev->backend_ctx->ref_count > 0) {
+            should_release_opencl = false;
+        }
     }
 
-    fprintf(ftrace, "[\n");
-    for (const ProfilingInfo & info : g_profiling_info) {
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_queued/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
-            info.kernel_name.c_str(), info.cmd_submit/1000);
-
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_start/1000);
-        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
-            info.kernel_name.c_str(), info.cmd_end/1000);
+    if (should_release_opencl) {
+        CL_CHECK(clReleaseContext(ctx->context));
     }
-    fclose(ftrace);
-#endif
 }
 
 //------------------------------------------------------------------------------
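The release path above is one half of a reference-counting scheme; the matching `backend_ctx->ref_count++` appears in the `ggml_backend_opencl_device_init` hunk further down. A minimal self-contained sketch of the pattern, with hypothetical `Device`/`Backend` stand-ins for the device- and backend-context structs (which are defined outside this diff):

```cpp
// Sketch only: release a shared resource when the last reference is gone.
#include <vector>

struct Backend { int ref_count = 0; };                // stands in for ggml_backend_opencl_context
struct Device  { Backend * backend_ctx = nullptr; };  // stands in for the device context

static std::vector<Device> g_devices;                 // mirrors g_ggml_backend_opencl_devices

// Mirrors the loop above: the shared cl_context may be released only when no
// device still holds a backend with a live reference.
static bool may_release_shared_context() {
    for (const Device & device : g_devices) {
        if (device.backend_ctx && device.backend_ctx->ref_count > 0) {
            return false;
        }
    }
    return true;
}
```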
@@ -1723,9 +2112,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
 }
 
 static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    ggml_cl2_free();
-
-    GGML_UNUSED(backend);
+    ggml_cl2_free(backend);
 }
 
 static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@@ -1752,7 +2139,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
 }
 
 static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
-
+    auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+
+    cl_event evt;
+    CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
+    CL_CHECK(clWaitForEvents(1, &evt));
+    CL_CHECK(clReleaseEvent(evt));
 }
 
 // Syncronizes the 'backend_ctx's device with others so that commands
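`ggml_backend_opencl_synchronize` now has a real body: enqueue a barrier, wait on its event, release the event. The same host-blocking pattern as a standalone helper; the `CL_CHECK_` macro below is a local stand-in for the one this file already defines:

```cpp
// Block the host until all commands previously enqueued on `queue` complete.
// Requires an OpenCL 1.2+ queue (clEnqueueBarrierWithWaitList).
#include <CL/cl.h>
#include <cstdio>
#include <cstdlib>

#define CL_CHECK_(err)                                          \
    do {                                                        \
        cl_int e_ = (err);                                      \
        if (e_ != CL_SUCCESS) {                                 \
            std::fprintf(stderr, "OpenCL error %d at %s:%d\n",  \
                         e_, __FILE__, __LINE__);               \
            std::exit(1);                                       \
        }                                                       \
    } while (0)

static void sync_queue(cl_command_queue queue) {
    cl_event evt;
    // The barrier's event completes only after everything enqueued before it.
    CL_CHECK_(clEnqueueBarrierWithWaitList(queue, 0, nullptr, &evt));
    CL_CHECK_(clWaitForEvents(1, &evt));
    CL_CHECK_(clReleaseEvent(evt));
}
```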
@@ -1856,6 +2248,8 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SUB:
             return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -1864,6 +2258,20 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU_QUICK:
                     return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+                case GGML_UNARY_OP_SIGMOID:
+                    return ggml_is_contiguous(op->src[0]);
+                case GGML_UNARY_OP_TANH:
+                    return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
+                           (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
+                default:
+                    return false;
+            }
+        case GGML_OP_GLU:
+            switch (ggml_get_glu_op(op)) {
+                case GGML_GLU_OP_GEGLU:
+                case GGML_GLU_OP_REGLU:
+                case GGML_GLU_OP_SWIGLU:
+                    return ggml_is_contiguous_1(op->src[0]) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16);
                 default:
                     return false;
             }
@@ -1873,16 +2281,36 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
             return true;
+        case GGML_OP_REPEAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
+        case GGML_OP_PAD:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
+                   op->src[0]->ne[3] == 1 && op->ne[3] == 1;
+        case GGML_OP_UPSCALE:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_CONCAT:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
+        case GGML_OP_GROUP_NORM:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_MUL_MAT:
             if (op->src[0]->type == GGML_TYPE_F16) {
                 return true;
             } else if (op->src[0]->type == GGML_TYPE_F32) {
-                return op->src[1]->type == GGML_TYPE_F32
+                return op->src[1]->type == GGML_TYPE_F32;
             } else if (op->src[0]->type == GGML_TYPE_Q4_0 ||
                        op->src[0]->type == GGML_TYPE_Q6_K) {
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
+        case GGML_OP_MUL_MAT_ID:
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                if (op->src[1]->type == GGML_TYPE_F32) {
+                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+                }
+            }
+            return false;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -1912,6 +2340,10 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         }
         case GGML_OP_IM2COL:
             return true;
+        case GGML_OP_ARGSORT:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_SUM_ROWS:
+            return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
         default:
             return false;
     }
@@ -1931,7 +2363,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
     /* .set_tensor_async        = */ NULL, /* ggml_backend_opencl_set_tensor_async */
     /* .get_tensor_async        = */ NULL, /* ggml_backend_opencl_get_tensor_async */
     /* .cpy_tensor_async        = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
-    /* .synchronize             = */ NULL,
+    /* .synchronize             = */ ggml_backend_opencl_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_update       = */ NULL,
@@ -2575,6 +3007,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
 
 static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
+    // Getting a new reference to the backend, increase ref_count
+    backend_ctx->ref_count++;
 
     ggml_backend_t backend = new ggml_backend {
         /* .guid      = */ ggml_backend_opencl_guid(),
@@ -2835,31 +3269,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
 #define dump_tensor(tensor)
 #endif
 
-//------------------------------------------------------------------------------
-// Profiling utility
-//------------------------------------------------------------------------------
-#ifdef GGML_OPENCL_PROFILING
-static void populateProfilingInfo(
-        ProfilingInfo& info, cl_event evt, cl_kernel kernel,
-        size_t global_size[3], size_t local_size[3],
-        const ggml_tensor * tensor) {
-    info.op_name = tensor->name;
-    info.kernel  = kernel;
-    info.evt     = evt;
-
-    info.local_size[0]  = local_size[0];
-    info.local_size[1]  = local_size[1];
-    info.local_size[2]  = local_size[2];
-    info.global_size[0] = global_size[0];
-    info.global_size[1] = global_size[1];
-    info.global_size[2] = global_size[2];
-    info.output_size[0] = tensor->ne[0];
-    info.output_size[1] = tensor->ne[1];
-    info.output_size[2] = tensor->ne[2];
-    info.output_size[3] = tensor->ne[3];
-}
-#endif
-
 //------------------------------------------------------------------------------
 // Ops
 //------------------------------------------------------------------------------
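With `populateProfilingInfo` gone from this spot, each op below stops open-coding the `#ifdef GGML_OPENCL_PROFILING` dance and calls `backend_ctx->enqueue_ndrange_kernel(...)` instead. The member itself is not visible in this diff, so the following free-function reconstruction is an assumption about its shape, not the actual 1.3.3 implementation:

```cpp
// Plausible sketch of the centralized enqueue helper, over an explicit queue.
static void enqueue_ndrange_kernel_sketch(
        cl_command_queue queue, cl_kernel kernel, cl_uint work_dim,
        size_t * global_work_size, size_t * local_work_size,
        const ggml_tensor * tensor) {
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, &evt));

    // Same bookkeeping the removed per-op blocks performed.
    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel,
                          global_work_size, local_work_size, tensor);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, NULL));
    GGML_UNUSED(tensor);
#endif
}
```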
@@ -2903,7 +3312,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -2947,16 +3355,8 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
     size_t local_work_size[] = {1, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
-}
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
 
 static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
@@ -2997,7 +3397,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3072,29 +3471,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }
 
@@ -3137,7 +3520,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
     const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3212,29 +3594,229 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
 
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
+}
+
+static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_div_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_div;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         unsigned int nth = MIN(64, ne0);
         size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
         size_t local_work_size[] = {nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
+}
 
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb10 = src1->nb[0];
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+    const cl_ulong nb13 = src1->nb[3];
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb0 = dst->nb[0];
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    bool bcast_row = false;
+    cl_kernel kernel;
+
+    if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
+
+        // src1 is a row
+        GGML_ASSERT(ne11 == 1);
+
+        bcast_row = true;
+        int ne = ne00 / 4;
+        kernel = backend_ctx->kernel_sub_row;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne));
+    } else {
+        kernel = backend_ctx->kernel_sub;
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb0));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb1));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
+    }
+
+    if (bcast_row) {
+        int n = ggml_nelements(dst)/4;
+        size_t global_work_size[] = {(size_t)n, 1, 1};
+        size_t local_work_size[] = {64, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    } else {
+        unsigned int nth = MIN(64, ne0);
+        size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
+        size_t local_work_size[] = {nth, 1, 1};
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     }
 }
 
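`ggml_cl_div` and `ggml_cl_sub` reuse the broadcast dispatch already used by add/mul: if `src1` is a single contiguous row and both row lengths are multiples of 4, the vectorized `*_row` kernel runs; otherwise the general strided kernel does. The predicate restated in isolation (parameter names mirror the ggml fields of the same names):

```cpp
#include <cstdint>

// True when the fast float4 row-broadcast kernel is applicable.
static bool use_row_broadcast(int64_t src1_nelements, bool src1_contiguous,
                              int ne00 /* src0 row length */,
                              int ne10 /* src1 row length */) {
    return src1_nelements == ne10  // src1 is exactly one row
        && src1_contiguous
        && ne00 % 4 == 0           // both rows divide into float4 chunks
        && ne10 % 4 == 0;
}
```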
@@ -3247,7 +3829,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3274,15 +3855,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3294,7 +3867,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3321,15 +3893,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
     size_t global_work_size[] = {(size_t)n, 1, 1};
     size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3341,7 +3905,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3373,15 +3936,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3393,7 +3948,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3418,15 +3972,50 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
 
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_sigmoid_f32;
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_sigmoid_f16;
+    } else {
+        GGML_ASSERT(false && "Unsupported data types for sigmoid (input and output must be both f32 or f16)");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+
+    const int64_t n = ggml_nelements(dst);
+
+    size_t global_work_size[] = {(size_t)n, 1, 1};
+    size_t local_work_size[] = {64, 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+    }
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3438,7 +4027,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3470,15 +4058,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3490,7 +4070,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -3531,15 +4110,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3551,7 +4122,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     UNUSED(src1);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     //ggml_backend_opencl_device_context * dev_ctx =
     //    (ggml_backend_opencl_device_context *)backend->device->context;
@@ -3615,15 +4185,537 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     // This is local memory - the size depends on subgroup size.
     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
 
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    int32_t n_groups = ((const int32_t *) dst->op_params)[0];
+    int32_t group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + n_groups - 1) / n_groups);
+    float eps = ((const float *) dst->op_params)[1];
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne = ne00*ne01*ne02;
+
+    cl_kernel kernel = backend_ctx->kernel_group_norm;
+
+    size_t sgs = 64;
+    if (backend_ctx->gpu_family == ADRENO) {
+        sgs = 64;
+    } else if (backend_ctx->gpu_family == INTEL) {
+        sgs = 32;
+    } else {
+        GGML_ASSERT(false && "Unsupported GPU");
+    }
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &group_size));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+
+    size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
+    size_t local_work_size[] = {(size_t)sgs, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
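The `group_size` expression above is an integer ceiling division: each of the `n_groups` groups covers `ne0*ne1*ceil(ne2/n_groups)` elements, and one subgroup-sized work-group (64 lanes on Adreno, 32 on Intel) is launched per group. Worked through with illustrative numbers:

```cpp
#include <cstdio>

int main() {
    const int ne0 = 8, ne1 = 8, ne2 = 10;  // hypothetical src0 extents
    const int n_groups = 4;                // dst->op_params[0]

    // (10 + 4 - 1) / 4 == 3, so group_size == 8 * 8 * 3 == 192.
    const int group_size = ne0 * ne1 * ((ne2 + n_groups - 1) / n_groups);

    const size_t sgs = 64;                 // Adreno subgroup size; 32 on Intel
    std::printf("group_size=%d global=%zu local=%zu\n",
                group_size, (size_t) n_groups * sgs, sgs);
    return 0;
}
```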
+static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    UNUSED(src1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0_abs = extra0->offset + src0->view_offs;
+    cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
+
+    cl_kernel kernel;
+    if (dst->type == GGML_TYPE_F32) {
+        kernel = backend_ctx->kernel_tanh_f32_nd;
+    } else if (dst->type == GGML_TYPE_F16) {
+        kernel = backend_ctx->kernel_tanh_f16_nd;
+    } else {
+        GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
+    }
+    GGML_ASSERT(kernel != nullptr);
+
+    const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
+    const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
+
+    const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
+    const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
+
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
+
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
+
+    size_t global_work_size[3];
+    if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
+        return;
+    }
+    global_work_size[0] = (size_t)ne10;
+    global_work_size[1] = (size_t)ne11;
+    global_work_size[2] = (size_t)ne12;
+
+    size_t lws0 = 16, lws1 = 4, lws2 = 1;
+    if (ne10 < 16) lws0 = ne10;
+    if (ne11 < 4) lws1 = ne11;
+    if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
+
+    while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
+    while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
+
+
+    size_t local_work_size[] = {lws0, lws1, lws2};
+
+    size_t* local_work_size_ptr = local_work_size;
+    if (!backend_ctx->non_uniform_workgroups) {
+        if (global_work_size[0] % local_work_size[0] != 0 ||
+            global_work_size[1] % local_work_size[1] != 0 ||
+            global_work_size[2] % local_work_size[2] != 0) {
+            local_work_size_ptr = NULL;
+        }
+    }
+    if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
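The tanh path sizes its 3D work-group by clamping a `{16, 4, 1}` starting point to the tensor extents, then halving one dimension at a time until the product fits a 256-work-item budget. The loop extracted so it can be checked in isolation:

```cpp
#include <cstddef>

// Shrink lws until lws[0]*lws[1]*lws[2] <= budget, halving dims in order.
static void fit_local_size(size_t lws[3], size_t budget = 256) {
    while (lws[0] * lws[1] * lws[2] > budget && lws[0] > 1) lws[0] /= 2;
    while (lws[0] * lws[1] * lws[2] > budget && lws[1] > 1) lws[1] /= 2;
    while (lws[0] * lws[1] * lws[2] > budget && lws[2] > 1) lws[2] /= 2;
}
```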
+static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_ASSERT(dst->type == src0->type);
+
+    UNUSED(src1_shape_def);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    if (backend_ctx->kernel_repeat == nullptr) {
+        GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
+        return;
+    }
+
+    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
+    cl_ulong off_dst = extra_dst->offset + dst->view_offs;
+
+    const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
+    const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
+
+    const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
+    const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_repeat;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
+    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
+    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
+    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
+    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
+    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
+    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
+
+    size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
+    size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
+    size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
+
+    size_t global_work_size[] = { gws0, gws1, gws2 };
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
+}
+
+static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    if (backend_ctx->kernel_pad == nullptr) {
+        GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
+        return;
+    }
+
+    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
+    cl_ulong off_dst = extra_dst->offset + dst->view_offs;
+
+    const int s_ne0 = src0->ne[0];
+    const int s_ne1 = src0->ne[1];
+    const int s_ne2 = src0->ne[2];
+
+    const int d_ne0 = dst->ne[0];
+    const int d_ne1 = dst->ne[1];
+    const int d_ne2 = dst->ne[2];
+
+    cl_kernel kernel = backend_ctx->kernel_pad;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
+
+    size_t lws0 = 64;
+    size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
+
+    size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
+    size_t local_work_size[] = { lws0, 1, 1 };
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;
+    }
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
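`ggml_cl_pad` rounds its first global dimension up to a multiple of the work-group size so the grid covers every output element (out-of-range work-items are expected to bounds-check in the kernel). The rounding idiom by itself:

```cpp
#include <cstddef>

// e.g. round_up(100, 64) == 128
static size_t round_up(size_t n, size_t multiple) {
    return ((n + multiple - 1) / multiple) * multiple;
}
```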
+static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
+    cl_kernel kernel = nullptr;
+
+    if (mode == GGML_SCALE_MODE_NEAREST) {
+        kernel = backend_ctx->kernel_upscale;
+        if (kernel == nullptr) {
+            GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
+            return;
+        }
+    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
+        kernel = backend_ctx->kernel_upscale_bilinear;
+        if (kernel == nullptr) {
+            GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
+            return;
+        }
+    } else {
+        GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
+        return;
+    }
+
+    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
+    cl_ulong off_dst = extra_dst->offset + dst->view_offs;
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const int ne00_src = src0->ne[0];
+    const int ne01_src = src0->ne[1];
+
+    const int ne10_dst = dst->ne[0];
+    const int ne11_dst = dst->ne[1];
+    const int ne12_dst = dst->ne[2];
+    const int ne13_dst = dst->ne[3];
+
+    const float sf0 = (float)dst->ne[0] / src0->ne[0];
+    const float sf1 = (float)dst->ne[1] / src0->ne[1];
+    const float sf2 = (float)dst->ne[2] / src0->ne[2];
+    const float sf3 = (float)dst->ne[3] / src0->ne[3];
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
+
+    if (mode == GGML_SCALE_MODE_NEAREST) {
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
+    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
+    }
+
+
+    size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
+    if (dst_total_elements == 0) {
+        return;
+    }
+    size_t global_work_size[] = { dst_total_elements, 1, 1 };
+    size_t local_work_size_pref = 256;
+    size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
+
+    size_t * local_work_size_ptr = local_work_size;
+    if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
+        local_work_size_ptr = nullptr;
+    }
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
+}
+
4548
|
+
+static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
+        GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
+        return;
+    }
+
+    ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
+    cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
+    cl_ulong off_dst  = extrad_cl->offset + dst->view_offs;
+
+    const int32_t dim = ((const int32_t *) dst->op_params)[0];
+    GGML_ASSERT(dim >= 0 && dim <= 3);
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
+        if (dim == 3) {
+
+            size_t nbytes_src0 = ggml_nbytes(src0);
+            size_t nbytes_src1 = ggml_nbytes(src1);
+
+            CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
+                off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
+            CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
+                off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
+        } else {
+
+            cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
+            size_t global_work_size[3];
+
+            for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
+                cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
+                cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
+                cl_ulong current_off_dst  = off_dst + (i3 * dst->nb[3]);
+
+                int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
+                int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
+                int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
+
+                CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
+                CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
+                CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
+                CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
+                CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
+                CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
+                CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
+                CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
+                CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
+                CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
+                CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
+                CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
+                CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
+                CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
+
+                global_work_size[0] = d_ne0;
+                global_work_size[1] = d_ne1;
+                global_work_size[2] = d_ne2;
+
+                backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
+            }
+        }
+    } else {
+        cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
+
+        long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
+        cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
+
+        cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
+
+        long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
+        cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
+
+
+        CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
+        CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
+        CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
+        CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
+
+        CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
+        CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
+        CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
+        CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
+        CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
+        CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
+        CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+        CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
+
+        CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
+        CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
+        CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
+        CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
+
+        CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
+        CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
+        CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
+        CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
+        CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
+        CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
+        CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
+        CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
+        CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
+
+        size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
+                                         d_ne2 > 0 ? (size_t)d_ne2 : 1,
+                                         d_ne3 > 0 ? (size_t)d_ne3 : 1 };
+
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
+    }
+}
+
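The `dim == 3` fast path in `ggml_cl_concat` above relies on the fact that, for fully contiguous tensors, concatenation along the outermost dimension is plain byte-wise juxtaposition, which is why two `clEnqueueCopyBuffer` calls replace a kernel launch. A host-side analogue (a sketch for illustration, not ggml API):

```cpp
#include <cstring>

// Concatenating contiguous tensors along the outermost dimension: dst is all
// of src0's bytes followed by all of src1's bytes, so two copies suffice.
static void concat_outermost(char * dst, const char * src0, size_t nbytes0,
                             const char * src1, size_t nbytes1) {
    memcpy(dst,           src0, nbytes0);
    memcpy(dst + nbytes0, src1, nbytes1);
}
```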
+static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    if (backend_ctx->kernel_timestep_embedding == nullptr) {
+        GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
+        return;
+    }
+
+    ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extra_dst  = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
+    cl_ulong off_dst  = extra_dst->offset + dst->view_offs;
+
+    const int logical_dim = dst->op_params[0];
+    const int max_period = dst->op_params[1];
+    const int dst_nb1_bytes = dst->nb[1];
+
+    cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
+
+    size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
+
+    size_t gws1 = (size_t)src0->ne[0];
+
+    size_t global_work_size[] = {gws0, gws1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
 }
 
 static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
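The hunks that follow all make the same mechanical change: each call site's `#ifdef GGML_OPENCL_PROFILING` block (create an event, enqueue, record profiling info) collapses into a single `backend_ctx->enqueue_ndrange_kernel(...)` call, and the now-unused local `queue` handles are removed. Judging from the removed code, the new member function plausibly looks like the sketch below (an assumption reconstructed from the call sites; the actual definition lives in an earlier part of this diff):

```cpp
// Sketch of the centralized enqueue wrapper, inferred from the call sites in
// this diff; not the verbatim implementation.
void ggml_backend_opencl_context::enqueue_ndrange_kernel(
        cl_kernel kernel, cl_uint work_dim,
        const size_t * global_work_size, const size_t * local_work_size,
        const ggml_tensor * tensor) {
#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, &evt));
    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel,
                          global_work_size, local_work_size, tensor);
#else
    GGML_UNUSED(tensor);
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL,
                                    global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
```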
@@ -3638,7 +4730,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -3843,15 +4934,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         static_cast<size_t>(padded_height_B)
     };
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
     } else {
         // no need to transpose B in other cases
         // create an image for B from sub_buffer
@@ -3973,16 +5056,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
 
     // enqueue kernel with profiling
     // <--------------------------------------------> //
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-    // enqueue kernel without profiling
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     // <--------------------------------------------> //
 
     // deallocate sub buffers and images
@@ -4062,15 +5136,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         global_work_size[2] = (size_t)ne12*ne13;
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     return;
 }
 #else // GGML_OPENCL_SOA_Q
@@ -4300,15 +5366,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else if (src0t == GGML_TYPE_Q4_K) {
         GGML_ASSERT(false && "not implemented");
     } else if (src0t == GGML_TYPE_Q3_K) {
@@ -4317,33 +5375,138 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         GGML_ASSERT(false && "not implemented");
     } else if (src0t == GGML_TYPE_Q6_K) {
         size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
-        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
-
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         int64_t ny = (ne11 + nrows - 1)/nrows;
 
         size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
         size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+    }
+}
 
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const ggml_tensor * src2 = dst->src[2];
+    GGML_ASSERT(src2);
+    GGML_ASSERT(src2->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offset2 = extra2->offset + src2->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
 #endif
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb02 = src0->nb[2];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+
+    const int ne20 = src2->ne[0];
+    const int ne21 = src2->ne[1];
+
+    const cl_ulong nb21 = src2->nb[1];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    const int r2 = ne12/ne02;
+    const int r3 = ne13/ne03;
+    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
+
+    GGML_ASSERT(ne00 == ne10);
+
+    int sgs = 32; // subgroup size
+    int nsg = 1; // number of subgroups
+    int nrows = 1; // number of rows in src1
+    int ndst = 4; // number of values produced by each subgroup
+
+    cl_kernel kernel;
+
+    // subgroup mat vec
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0: {
+            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 1;
+                ndst = 8;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 1;
+                ndst = 8;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
+            CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
+
+            break;
+        }
+        default:
+            GGML_ASSERT(false && "not implemented");
     }
+
+    int _ne1 = 1;
+    int ne123 = dst_rows;
+
+    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
+    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
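Worth noting in `ggml_cl_mul_mat_id` above: the expert dimension is folded into the third NDRange axis (`dst_rows = ne20*ne21`, one slice per used-expert/row pair), while the first axis rounds the rows of `src0` up to a multiple of `ndst*nsg` before scaling by the subgroup size. A standalone restatement of that arithmetic (a hypothetical helper, for illustration only):

```cpp
#include <cstddef>

// Restates the work-size computation from ggml_cl_mul_mat_id: rows of A are
// distributed ndst-per-subgroup across nsg subgroups of sgs work-items, and
// every (expert, row) pair gets its own slice along the third axis.
static void mul_mat_id_work_sizes(int ne01, int dst_rows, int sgs, int nsg,
                                  int nrows, int ndst,
                                  size_t global[3], size_t local[3]) {
    const int _ne1 = 1; // one src1 row per (expert, row) slice
    global[0] = (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs;
    global[1] = (size_t)(_ne1 + nrows - 1)/nrows*nsg;
    global[2] = (size_t)dst_rows;
    local[0]  = (size_t)sgs;
    local[1]  = (size_t)nsg;
    local[2]  = 1;
}
```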
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4356,7 +5519,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
     GGML_ASSERT(ggml_is_contiguous(src0));
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     float scale;
     memcpy(&scale, dst->op_params, sizeof(scale));
@@ -4385,15 +5547,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
         local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
     }
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
 }
 
 static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4430,7 +5584,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4495,15 +5648,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
 }
 
 static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4526,7 +5671,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
     const int ne02 = src0 ? src0->ne[2] : 0;
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4550,15 +5694,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
         size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
         size_t local_work_size[] = {64, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
         kernel = backend_ctx->kernel_diag_mask_inf;
 
@@ -4578,15 +5714,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
             local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
         }
 
-#ifdef GGML_OPENCL_PROFILING
-        cl_event evt;
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
-
-        g_profiling_info.emplace_back();
-        populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
-#else
-        CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
-#endif
+        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
     }
 }
 
@@ -4606,7 +5734,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     }
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4686,15 +5813,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4706,7 +5825,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     GGML_ASSERT(dst->extra);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
@@ -4872,15 +5990,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
-
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4895,7 +6005,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
 
     ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
-    cl_command_queue queue = backend_ctx->queue;
 
     ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
     ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
@@ -4964,15 +6073,192 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
     size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
     size_t local_work_size[] = {256, 1, 1};
 
-#ifdef GGML_OPENCL_PROFILING
-    cl_event evt;
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
 
-    g_profiling_info.emplace_back();
-    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
-#else
-    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-#endif
+static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    int ne00_padded = 1;
+    while (ne00_padded < ne00) {
+        ne00_padded *= 2;
+    }
+
+    int order = (enum ggml_sort_order) dst->op_params[0];
+
+    cl_kernel kernel = backend_ctx->kernel_argsort_f32_i32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne00_padded));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &order));
+    CL_CHECK(clSetKernelArg(kernel, 7, ne00_padded*sizeof(int), NULL));
+
+    size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
+    size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
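`ggml_cl_argsort` above rounds each row length up to the next power of two and launches one work-group per row of exactly that size, with `ne00_padded*sizeof(int)` bytes of local memory as index scratch; sorting networks of the bitonic family need power-of-two sequences, which is presumably why the padding is there. The rounding step in isolation:

```cpp
// Next power of two at or above n, as used for ne00_padded above;
// e.g. next_pow2(1000) == 1024, next_pow2(64) == 64.
static int next_pow2(int n) {
    int p = 1;
    while (p < n) {
        p *= 2;
    }
    return p;
}
```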
+static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+    GGML_UNUSED(src1);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb02 = src0->nb[2];
+    const cl_ulong nb03 = src0->nb[3];
+
+    const cl_ulong nb1 = dst->nb[1];
+    const cl_ulong nb2 = dst->nb[2];
+    const cl_ulong nb3 = dst->nb[3];
+
+    cl_kernel kernel = backend_ctx->kernel_sum_rows_f32;
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb2));
+    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb3));
+
+    size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
+    size_t local_work_size[] = {(size_t)64, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+}
+
+static void ggml_cl_glu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0));
+
+    if (src1) {
+        GGML_ASSERT(src1);
+        GGML_ASSERT(src1->extra);
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+
+    cl_kernel kernel;
+    switch (ggml_get_glu_op(dst)) {
+        case GGML_GLU_OP_GEGLU:
+            if (dst->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_geglu;
+            } else {
+                kernel = backend_ctx->kernel_geglu_f16;
+            }
+            break;
+        case GGML_GLU_OP_REGLU:
+            if (dst->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_reglu;
+            } else {
+                kernel = backend_ctx->kernel_reglu_f16;
+            }
+            break;
+        case GGML_GLU_OP_SWIGLU:
+            if (dst->type == GGML_TYPE_F32) {
+                kernel = backend_ctx->kernel_swiglu;
+            } else {
+                kernel = backend_ctx->kernel_swiglu_f16;
+            }
+            break;
+        default:
+            GGML_ABORT("Unsupported glu op");
+    }
+
+    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
+
+    cl_ulong offset0 = extra0->offset + src0->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+    cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
+
+    const int ne0 = dst->ne[0];
+
+    const cl_ulong nb01 = src0->nb[1];
+    const cl_ulong nb11 = src1 ? src1->nb[1] : nb01;
+
+    const cl_ulong nb1 = dst->nb[1];
+
+    const int swp = ((const int32_t *) dst->op_params)[1];
+    const int ne00_off = src1 ? 0 : (swp ? ne0 : 0);
+    const int ne10_off = src1 ? 0 : (swp ? 0 : ne0);
+
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), src1 ? &extra1->data_device : &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb11));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne0));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb1));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne00_off));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10_off));
+
+    const size_t nrows = ggml_nrows(src0);
+    size_t nth = 512;
+    size_t global_work_size[] = {nrows*nth, 1, 1};
+    size_t local_work_size[] = {nth, 1, 1};
+
+    backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
 }
 
 //------------------------------------------------------------------------------
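A subtlety in `ggml_cl_glu` above: when `src1` is absent, both GLU operands come from the two halves of each `src0` row, and `op_params[1]` (`swp`) selects which half feeds which operand via the `ne00_off`/`ne10_off` element offsets. Restated as a hypothetical standalone helper:

```cpp
// Mirrors the ne00_off/ne10_off computation in ggml_cl_glu: with a separate
// src1 both offsets are zero; otherwise one operand reads the first half of
// the row and the other reads the second half, with swp swapping the roles.
static void glu_half_offsets(bool has_src1, bool swapped, int ne0,
                             int * ne00_off, int * ne10_off) {
    if (has_src1) {
        *ne00_off = 0;
        *ne10_off = 0;
    } else if (swapped) {
        *ne00_off = ne0;
        *ne10_off = 0;
    } else {
        *ne00_off = 0;
        *ne10_off = ne0;
    }
}
```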
@@ -5023,6 +6309,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul;
             break;
+        case GGML_OP_DIV:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_div;
+            break;
+        case GGML_OP_SUB:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sub;
+            break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
@@ -5049,9 +6347,27 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
                 }
                 func = ggml_cl_relu;
                 break;
+            case GGML_UNARY_OP_SIGMOID:
+                if (!any_on_device) {
+                    return false;
+                }
+                func = ggml_cl_sigmoid;
+                break;
+            case GGML_UNARY_OP_TANH:
+                if (!any_on_device) {
+                    return false;
+                }
+                func = ggml_cl_tanh;
+                break;
             default:
                 return false;
         } break;
+        case GGML_OP_GLU:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_glu;
+            break;
         case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
@@ -5070,12 +6386,54 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_rms_norm;
             break;
+        case GGML_OP_GROUP_NORM:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_group_norm;
+            break;
+        case GGML_OP_REPEAT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_repeat;
+            break;
+        case GGML_OP_PAD:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_pad(backend, tensor->src[0], tensor);
+            return true;
+        case GGML_OP_UPSCALE:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_upscale(backend, tensor->src[0], tensor);
+            return true;
+        case GGML_OP_CONCAT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_concat;
+            break;
+        case GGML_OP_TIMESTEP_EMBEDDING:
+            if (!any_on_device) {
+                return false;
+            }
+            ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
+            return true;
         case GGML_OP_MUL_MAT:
             if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
             func = ggml_cl_mul_mat;
             break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_mul_mat_id;
+            break;
         case GGML_OP_SCALE:
             if (!any_on_device) {
                 return false;
@@ -5115,6 +6473,18 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_im2col;
             break;
+        case GGML_OP_ARGSORT:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_argsort;
+            break;
+        case GGML_OP_SUM_ROWS:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_sum_rows;
+            break;
         default:
             return false;
     }
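Two dispatch styles coexist in the `ggml_cl_compute_forward` additions above: ops that fit the common `(backend, src0, src1, dst)` shape assign the shared `func` pointer and are invoked after the switch, while single-input ops (`GGML_OP_PAD`, `GGML_OP_UPSCALE`, `GGML_OP_TIMESTEP_EMBEDDING`) are called inline with only `tensor->src[0]` and `return true` immediately. A toy model of the pattern (names illustrative, not ggml API):

```cpp
#include <cstdio>

// Binary ops record a handler and call it once after the switch; unary ops
// are called directly inside their case and return early.
typedef void (*binary_fn)(int src0, int src1, int dst);

static void op_concat(int, int, int) { std::puts("concat"); }
static void op_pad(int, int)         { std::puts("pad");    }

static bool dispatch(int op, int src0, int src1, int dst) {
    binary_fn func = nullptr;
    switch (op) {
        case 0:                  // common binary shape (e.g. concat)
            func = op_concat;
            break;
        case 1:                  // unary shape (e.g. pad): direct call
            op_pad(src0, dst);
            return true;
        default:
            return false;
    }
    func(src0, src1, dst);
    return true;
}
```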