@novastera-oss/llamarn 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +8 -3
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +56 -22
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/common/arg.cpp +7 -0
- package/cpp/llama.cpp/common/common.cpp +3 -0
- package/cpp/llama.cpp/common/common.h +1 -0
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/convert_hf_to_gguf.py +118 -20
- package/cpp/llama.cpp/ggml/CMakeLists.txt +1 -0
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +17 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +31 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +109 -108
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +1027 -1038
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +53 -52
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +56 -55
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +42 -41
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +24 -23
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +29 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +30 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +83 -82
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +20 -19
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +9 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +83 -102
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +192 -67
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +25 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +56 -40
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +211 -33
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +54 -29
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +84 -31
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +227 -41
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +362 -182
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +240 -535
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +99 -166
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +94 -72
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +6 -9
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +45 -54
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +132 -201
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +12 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +57 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -13
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +76 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +21 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +8 -3
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +55 -0
- package/cpp/llama.cpp/src/llama-arch.h +18 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +570 -359
- package/cpp/llama.cpp/src/llama-batch.h +98 -70
- package/cpp/llama.cpp/src/llama-chat.cpp +11 -6
- package/cpp/llama.cpp/src/llama-context.cpp +101 -107
- package/cpp/llama.cpp/src/llama-context.h +13 -13
- package/cpp/llama.cpp/src/llama-graph.cpp +199 -252
- package/cpp/llama.cpp/src/llama-graph.h +44 -32
- package/cpp/llama.cpp/src/llama-hparams.cpp +4 -0
- package/cpp/llama.cpp/src/llama-hparams.h +8 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +51 -53
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +19 -24
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +110 -104
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +17 -22
- package/cpp/llama.cpp/src/llama-kv-cells.h +35 -11
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +66 -67
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +16 -21
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +69 -68
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +15 -20
- package/cpp/llama.cpp/src/llama-memory.h +18 -22
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1006 -472
- package/cpp/llama.cpp/src/llama-model.h +22 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +87 -5
- package/cpp/llama.cpp/src/llama-vocab.cpp +26 -3
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/rn-utils.h +3 -0
- package/ios/include/common.h +1 -0
- package/ios/include/llama.h +8 -3
- package/ios/libs/llama.xcframework/Info.plist +19 -19
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3744
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4863
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4834
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3742
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4900
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4871
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3773
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +8 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -1
|
@@ -231,6 +231,71 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
|
|
|
231
231
|
return { type, major, minor, patch };
|
|
232
232
|
}
|
|
233
233
|
|
|
234
|
+
// Profiling
|
|
235
|
+
struct ProfilingInfo {
|
|
236
|
+
std::string op_name;
|
|
237
|
+
std::string kernel_name;
|
|
238
|
+
|
|
239
|
+
cl_kernel kernel;
|
|
240
|
+
cl_event evt;
|
|
241
|
+
|
|
242
|
+
cl_ulong cmd_queued;
|
|
243
|
+
cl_ulong cmd_submit;
|
|
244
|
+
cl_ulong cmd_start;
|
|
245
|
+
cl_ulong cmd_end;
|
|
246
|
+
cl_ulong overhead_start;
|
|
247
|
+
cl_ulong overhead_end;
|
|
248
|
+
// For the times below, see spec for clGetEventProfilingInfo
|
|
249
|
+
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
|
250
|
+
cl_ulong cmd_queued_duration_ns;
|
|
251
|
+
// The time kernel spent for submission - START - SUBMIT
|
|
252
|
+
cl_ulong cmd_submit_duration_ns;
|
|
253
|
+
// Kernel execution time in nanoseconds - END - START
|
|
254
|
+
cl_ulong cmd_duration_ns;
|
|
255
|
+
// The time for the kernel to complete - COMPLETE - END
|
|
256
|
+
cl_ulong cmd_complete_duration_ns;
|
|
257
|
+
// Total time to finish the kernel - COMPELTE - QUEUED
|
|
258
|
+
cl_ulong cmd_total_duration_ns;
|
|
259
|
+
// Global and local work sizes.
|
|
260
|
+
size_t global_size[3];
|
|
261
|
+
size_t local_size[3];
|
|
262
|
+
// Op output size.
|
|
263
|
+
size_t output_size[4];
|
|
264
|
+
};
|
|
265
|
+
|
|
266
|
+
static void populateProfilingInfo(
|
|
267
|
+
ProfilingInfo& info, cl_event evt, cl_kernel kernel, cl_uint work_dim,
|
|
268
|
+
size_t global_size[3], size_t local_size[3],
|
|
269
|
+
const ggml_tensor * tensor) {
|
|
270
|
+
info.op_name = tensor->name;
|
|
271
|
+
info.kernel = kernel;
|
|
272
|
+
info.evt = evt;
|
|
273
|
+
|
|
274
|
+
// 0 means not specified, e.g., 2D workgroup, or NULL for driver to choose
|
|
275
|
+
info.local_size[0] = 0;
|
|
276
|
+
info.local_size[1] = 0;
|
|
277
|
+
info.local_size[2] = 0;
|
|
278
|
+
|
|
279
|
+
info.global_size[0] = 0;
|
|
280
|
+
info.global_size[1] = 0;
|
|
281
|
+
info.global_size[2] = 0;
|
|
282
|
+
|
|
283
|
+
if (local_size) {
|
|
284
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
|
285
|
+
info.local_size[i] = local_size[i];
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
for (cl_uint i = 0; i < work_dim; ++i) {
|
|
290
|
+
info.global_size[i] = global_size[i];
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
info.output_size[0] = tensor->ne[0];
|
|
294
|
+
info.output_size[1] = tensor->ne[1];
|
|
295
|
+
info.output_size[2] = tensor->ne[2];
|
|
296
|
+
info.output_size[3] = tensor->ne[3];
|
|
297
|
+
}
|
|
298
|
+
|
|
234
299
|
struct ggml_backend_opencl_context;
|
|
235
300
|
|
|
236
301
|
// backend device context
|
|
@@ -254,6 +319,8 @@ struct ggml_backend_opencl_device_context {
|
|
|
254
319
|
|
|
255
320
|
// backend context
|
|
256
321
|
struct ggml_backend_opencl_context {
|
|
322
|
+
int ref_count;
|
|
323
|
+
|
|
257
324
|
cl_device_id device;
|
|
258
325
|
std::string device_name;
|
|
259
326
|
|
|
@@ -369,6 +436,108 @@ struct ggml_backend_opencl_context {
|
|
|
369
436
|
cl_kernel kernel_timestep_embedding;
|
|
370
437
|
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
|
371
438
|
|
|
439
|
+
std::vector<ProfilingInfo> profiling_info;
|
|
440
|
+
|
|
441
|
+
void write_profiling_info() {
|
|
442
|
+
FILE * fperf = fopen("cl_profiling.csv", "w");
|
|
443
|
+
if (!fperf) {
|
|
444
|
+
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
445
|
+
return;
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
// Populate profiling info
|
|
449
|
+
for (ProfilingInfo & info : profiling_info) {
|
|
450
|
+
cl_ulong cmd_queued;
|
|
451
|
+
cl_ulong cmd_submit;
|
|
452
|
+
cl_ulong cmd_start;
|
|
453
|
+
cl_ulong cmd_end;
|
|
454
|
+
cl_ulong cmd_complete;
|
|
455
|
+
|
|
456
|
+
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
457
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
458
|
+
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
459
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
460
|
+
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
461
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
462
|
+
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
463
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
464
|
+
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
465
|
+
CL_CHECK(clGetEventProfilingInfo(
|
|
466
|
+
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
467
|
+
CL_CHECK(clReleaseEvent(info.evt));
|
|
468
|
+
|
|
469
|
+
char kernel_name[512];
|
|
470
|
+
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
471
|
+
sizeof(kernel_name), kernel_name, NULL));
|
|
472
|
+
info.kernel_name = kernel_name;
|
|
473
|
+
|
|
474
|
+
info.cmd_queued = cmd_queued;
|
|
475
|
+
info.cmd_submit = cmd_submit;
|
|
476
|
+
info.cmd_start = cmd_start;
|
|
477
|
+
info.cmd_end = cmd_end;
|
|
478
|
+
|
|
479
|
+
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
480
|
+
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
481
|
+
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
482
|
+
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
483
|
+
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
// Dump a csv
|
|
487
|
+
float total_kernel_time = 0;
|
|
488
|
+
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
489
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
490
|
+
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
491
|
+
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
492
|
+
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
493
|
+
info.cmd_queued_duration_ns/1.e6f,
|
|
494
|
+
info.cmd_submit_duration_ns/1.e6f,
|
|
495
|
+
info.cmd_duration_ns/1.e6f,
|
|
496
|
+
info.cmd_complete_duration_ns/1.e6f,
|
|
497
|
+
info.cmd_total_duration_ns/1.e6f,
|
|
498
|
+
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
499
|
+
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
500
|
+
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
501
|
+
}
|
|
502
|
+
fclose(fperf);
|
|
503
|
+
|
|
504
|
+
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
505
|
+
|
|
506
|
+
// Dump a simple chrome trace
|
|
507
|
+
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
508
|
+
if (!ftrace) {
|
|
509
|
+
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
510
|
+
return;
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
fprintf(ftrace, "[\n");
|
|
514
|
+
for (const ProfilingInfo & info : profiling_info) {
|
|
515
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
516
|
+
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
517
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
518
|
+
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
519
|
+
|
|
520
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
521
|
+
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
522
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
523
|
+
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
524
|
+
}
|
|
525
|
+
fclose(ftrace);
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
|
529
|
+
#ifdef GGML_OPENCL_PROFILING
|
|
530
|
+
cl_event evt;
|
|
531
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
532
|
+
|
|
533
|
+
profiling_info.emplace_back();
|
|
534
|
+
populateProfilingInfo(profiling_info.back(), evt, kernel, work_dim, global_work_size, local_work_size, tensor);
|
|
535
|
+
#else
|
|
536
|
+
GGML_UNUSED(tensor);
|
|
537
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
538
|
+
#endif
|
|
539
|
+
}
|
|
540
|
+
|
|
372
541
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
373
542
|
// Transpose kernels
|
|
374
543
|
cl_program program_transpose;
|
|
@@ -395,46 +564,19 @@ struct ggml_backend_opencl_context {
|
|
|
395
564
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096;
|
|
396
565
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096;
|
|
397
566
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
398
|
-
};
|
|
399
567
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
// Profiling
|
|
568
|
+
void free() {
|
|
569
|
+
ref_count--;
|
|
570
|
+
if (ref_count == 0) {
|
|
404
571
|
#ifdef GGML_OPENCL_PROFILING
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
cl_kernel kernel;
|
|
410
|
-
cl_event evt;
|
|
411
|
-
|
|
412
|
-
cl_ulong cmd_queued;
|
|
413
|
-
cl_ulong cmd_submit;
|
|
414
|
-
cl_ulong cmd_start;
|
|
415
|
-
cl_ulong cmd_end;
|
|
416
|
-
cl_ulong overhead_start;
|
|
417
|
-
cl_ulong overhead_end;
|
|
418
|
-
// For the times below, see spec for clGetEventProfilingInfo
|
|
419
|
-
// The time kernel spent in cmd queue - SUBMIT - QUEUED
|
|
420
|
-
cl_ulong cmd_queued_duration_ns;
|
|
421
|
-
// The time kernel spent for submission - START - SUBMIT
|
|
422
|
-
cl_ulong cmd_submit_duration_ns;
|
|
423
|
-
// Kernel execution time in nanoseconds - END - START
|
|
424
|
-
cl_ulong cmd_duration_ns;
|
|
425
|
-
// The time for the kernel to complete - COMPLETE - END
|
|
426
|
-
cl_ulong cmd_complete_duration_ns;
|
|
427
|
-
// Total time to finish the kernel - COMPELTE - QUEUED
|
|
428
|
-
cl_ulong cmd_total_duration_ns;
|
|
429
|
-
// Global and local work sizes.
|
|
430
|
-
size_t global_size[3];
|
|
431
|
-
size_t local_size[3];
|
|
432
|
-
// Op output size.
|
|
433
|
-
size_t output_size[4];
|
|
572
|
+
write_profiling_info();
|
|
573
|
+
#endif
|
|
574
|
+
}
|
|
575
|
+
}
|
|
434
576
|
};
|
|
435
577
|
|
|
436
|
-
|
|
437
|
-
|
|
578
|
+
// All registered devices with a default device in the front.
|
|
579
|
+
static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
|
|
438
580
|
|
|
439
581
|
inline std::string read_file(const std::string &path) {
|
|
440
582
|
std::ifstream ifs(path);
|
|
@@ -1669,6 +1811,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1669
1811
|
backend_ctx->device = dev_ctx->device;
|
|
1670
1812
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
|
|
1671
1813
|
|
|
1814
|
+
// ref_count get increased in ggml_backend_opencl_device_init
|
|
1815
|
+
// This function is also used to retrieve backend context, so we don't want
|
|
1816
|
+
// to increase ref_count for each call. We only want to increase ref_count
|
|
1817
|
+
// when the associated device is initialized
|
|
1818
|
+
backend_ctx->ref_count = 0;
|
|
1819
|
+
|
|
1672
1820
|
if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
|
|
1673
1821
|
strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
|
|
1674
1822
|
strstr(dev_ctx->device_version.c_str(), "Adreno")) {
|
|
@@ -1841,93 +1989,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
1841
1989
|
return dev_ctx->backend_ctx;
|
|
1842
1990
|
}
|
|
1843
1991
|
|
|
1844
|
-
static void ggml_cl2_free(
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
if (!fperf) {
|
|
1848
|
-
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n");
|
|
1849
|
-
return;
|
|
1850
|
-
}
|
|
1992
|
+
static void ggml_cl2_free(ggml_backend_t backend) {
|
|
1993
|
+
ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *) backend->context;
|
|
1994
|
+
ctx->free();
|
|
1851
1995
|
|
|
1852
|
-
//
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
CL_CHECK(clWaitForEvents(1, &info.evt));
|
|
1861
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1862
|
-
info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
|
|
1863
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1864
|
-
info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
|
|
1865
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1866
|
-
info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
|
|
1867
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1868
|
-
info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
|
|
1869
|
-
CL_CHECK(clGetEventProfilingInfo(
|
|
1870
|
-
info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
|
|
1871
|
-
CL_CHECK(clReleaseEvent(info.evt));
|
|
1872
|
-
|
|
1873
|
-
char kernel_name[512];
|
|
1874
|
-
CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
|
|
1875
|
-
sizeof(kernel_name), kernel_name, NULL));
|
|
1876
|
-
info.kernel_name = kernel_name;
|
|
1877
|
-
|
|
1878
|
-
info.cmd_queued = cmd_queued;
|
|
1879
|
-
info.cmd_submit = cmd_submit;
|
|
1880
|
-
info.cmd_start = cmd_start;
|
|
1881
|
-
info.cmd_end = cmd_end;
|
|
1882
|
-
|
|
1883
|
-
info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
|
|
1884
|
-
info.cmd_submit_duration_ns = cmd_start - cmd_submit;
|
|
1885
|
-
info.cmd_duration_ns = cmd_end - cmd_start;
|
|
1886
|
-
info.cmd_complete_duration_ns = cmd_complete - cmd_end;
|
|
1887
|
-
info.cmd_total_duration_ns = cmd_complete - cmd_queued;
|
|
1888
|
-
}
|
|
1889
|
-
|
|
1890
|
-
// Dump a csv
|
|
1891
|
-
float total_kernel_time = 0;
|
|
1892
|
-
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
1893
|
-
for (const ProfilingInfo & info : g_profiling_info) {
|
|
1894
|
-
total_kernel_time += info.cmd_duration_ns/1.e6f;
|
|
1895
|
-
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
1896
|
-
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
1897
|
-
info.cmd_queued_duration_ns/1.e6f,
|
|
1898
|
-
info.cmd_submit_duration_ns/1.e6f,
|
|
1899
|
-
info.cmd_duration_ns/1.e6f,
|
|
1900
|
-
info.cmd_complete_duration_ns/1.e6f,
|
|
1901
|
-
info.cmd_total_duration_ns/1.e6f,
|
|
1902
|
-
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
1903
|
-
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
1904
|
-
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
1905
|
-
}
|
|
1906
|
-
fclose(fperf);
|
|
1907
|
-
|
|
1908
|
-
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
1909
|
-
|
|
1910
|
-
// Dump a simple chrome trace
|
|
1911
|
-
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
1912
|
-
if (!ftrace) {
|
|
1913
|
-
GGML_LOG_ERROR("Failed to open cl_trace.json\n");
|
|
1914
|
-
return;
|
|
1996
|
+
// The CL context is shared by all backends, release it if all backends have been released
|
|
1997
|
+
bool should_release_opencl = true;
|
|
1998
|
+
for (auto device : g_ggml_backend_opencl_devices) {
|
|
1999
|
+
ggml_backend_opencl_device_context * ctx_dev = (ggml_backend_opencl_device_context *) device.context;
|
|
2000
|
+
if (ctx_dev->backend_ctx->ref_count > 0) {
|
|
2001
|
+
should_release_opencl = false;
|
|
2002
|
+
}
|
|
1915
2003
|
}
|
|
1916
2004
|
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1920
|
-
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
1921
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
1922
|
-
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
1923
|
-
|
|
1924
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1925
|
-
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
1926
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
1927
|
-
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
2005
|
+
if (should_release_opencl) {
|
|
2006
|
+
CL_CHECK(clReleaseContext(ctx->context));
|
|
1928
2007
|
}
|
|
1929
|
-
fclose(ftrace);
|
|
1930
|
-
#endif
|
|
1931
2008
|
}
|
|
1932
2009
|
|
|
1933
2010
|
//------------------------------------------------------------------------------
|
|
@@ -2011,9 +2088,7 @@ static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
|
|
|
2011
2088
|
}
|
|
2012
2089
|
|
|
2013
2090
|
static void ggml_backend_opencl_free(ggml_backend_t backend) {
|
|
2014
|
-
ggml_cl2_free();
|
|
2015
|
-
|
|
2016
|
-
GGML_UNUSED(backend);
|
|
2091
|
+
ggml_cl2_free(backend);
|
|
2017
2092
|
}
|
|
2018
2093
|
|
|
2019
2094
|
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
@@ -2899,6 +2974,8 @@ static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct
|
|
|
2899
2974
|
|
|
2900
2975
|
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
2901
2976
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev);
|
|
2977
|
+
// Getting a new reference to the backend, increase ref_count
|
|
2978
|
+
backend_ctx->ref_count++;
|
|
2902
2979
|
|
|
2903
2980
|
ggml_backend_t backend = new ggml_backend {
|
|
2904
2981
|
/* .guid = */ ggml_backend_opencl_guid(),
|
|
@@ -3159,31 +3236,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
3159
3236
|
#define dump_tensor(tensor)
|
|
3160
3237
|
#endif
|
|
3161
3238
|
|
|
3162
|
-
//------------------------------------------------------------------------------
|
|
3163
|
-
// Profiling utility
|
|
3164
|
-
//------------------------------------------------------------------------------
|
|
3165
|
-
#ifdef GGML_OPENCL_PROFILING
|
|
3166
|
-
static void populateProfilingInfo(
|
|
3167
|
-
ProfilingInfo& info, cl_event evt, cl_kernel kernel,
|
|
3168
|
-
size_t global_size[3], size_t local_size[3],
|
|
3169
|
-
const ggml_tensor * tensor) {
|
|
3170
|
-
info.op_name = tensor->name;
|
|
3171
|
-
info.kernel = kernel;
|
|
3172
|
-
info.evt = evt;
|
|
3173
|
-
|
|
3174
|
-
info.local_size[0] = local_size[0];
|
|
3175
|
-
info.local_size[1] = local_size[1];
|
|
3176
|
-
info.local_size[2] = local_size[2];
|
|
3177
|
-
info.global_size[0] = global_size[0];
|
|
3178
|
-
info.global_size[1] = global_size[1];
|
|
3179
|
-
info.global_size[2] = global_size[2];
|
|
3180
|
-
info.output_size[0] = tensor->ne[0];
|
|
3181
|
-
info.output_size[1] = tensor->ne[1];
|
|
3182
|
-
info.output_size[2] = tensor->ne[2];
|
|
3183
|
-
info.output_size[3] = tensor->ne[3];
|
|
3184
|
-
}
|
|
3185
|
-
#endif
|
|
3186
|
-
|
|
3187
3239
|
//------------------------------------------------------------------------------
|
|
3188
3240
|
// Ops
|
|
3189
3241
|
//------------------------------------------------------------------------------
|
|
@@ -3227,7 +3279,6 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3227
3279
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
|
3228
3280
|
|
|
3229
3281
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3230
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3231
3282
|
|
|
3232
3283
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3233
3284
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3271,15 +3322,7 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3271
3322
|
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1};
|
|
3272
3323
|
size_t local_work_size[] = {1, 1, 1};
|
|
3273
3324
|
|
|
3274
|
-
|
|
3275
|
-
cl_event evt;
|
|
3276
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3277
|
-
|
|
3278
|
-
g_profiling_info.emplace_back();
|
|
3279
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3280
|
-
#else
|
|
3281
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3282
|
-
#endif
|
|
3325
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3283
3326
|
}
|
|
3284
3327
|
|
|
3285
3328
|
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3321,7 +3364,6 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3321
3364
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3322
3365
|
|
|
3323
3366
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3324
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3325
3367
|
|
|
3326
3368
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3327
3369
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3396,29 +3438,13 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3396
3438
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3397
3439
|
}
|
|
3398
3440
|
|
|
3399
|
-
|
|
3400
|
-
cl_event evt;
|
|
3401
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3402
|
-
|
|
3403
|
-
g_profiling_info.emplace_back();
|
|
3404
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3405
|
-
#else
|
|
3406
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3407
|
-
#endif
|
|
3441
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3408
3442
|
} else {
|
|
3409
3443
|
unsigned int nth = MIN(64, ne0);
|
|
3410
3444
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3411
3445
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3412
3446
|
|
|
3413
|
-
|
|
3414
|
-
cl_event evt;
|
|
3415
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3416
|
-
|
|
3417
|
-
g_profiling_info.emplace_back();
|
|
3418
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3419
|
-
#else
|
|
3420
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3421
|
-
#endif
|
|
3447
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3422
3448
|
}
|
|
3423
3449
|
}
|
|
3424
3450
|
|
|
@@ -3461,7 +3487,6 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3461
3487
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3462
3488
|
|
|
3463
3489
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3464
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3465
3490
|
|
|
3466
3491
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3467
3492
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3536,29 +3561,13 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3536
3561
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3537
3562
|
}
|
|
3538
3563
|
|
|
3539
|
-
|
|
3540
|
-
cl_event evt;
|
|
3541
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3542
|
-
|
|
3543
|
-
g_profiling_info.emplace_back();
|
|
3544
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3545
|
-
#else
|
|
3546
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3547
|
-
#endif
|
|
3564
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3548
3565
|
} else {
|
|
3549
3566
|
unsigned int nth = MIN(64, ne0);
|
|
3550
3567
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3551
3568
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3552
3569
|
|
|
3553
|
-
|
|
3554
|
-
cl_event evt;
|
|
3555
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3556
|
-
|
|
3557
|
-
g_profiling_info.emplace_back();
|
|
3558
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3559
|
-
#else
|
|
3560
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3561
|
-
#endif
|
|
3570
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3562
3571
|
}
|
|
3563
3572
|
}
|
|
3564
3573
|
|
|
@@ -3598,7 +3607,6 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3598
3607
|
const cl_ulong nb3 = dst->nb[3];
|
|
3599
3608
|
|
|
3600
3609
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3601
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3602
3610
|
|
|
3603
3611
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3604
3612
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3661,29 +3669,13 @@ static void ggml_cl_div(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3661
3669
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3662
3670
|
size_t local_work_size[] = {64, 1, 1};
|
|
3663
3671
|
|
|
3664
|
-
|
|
3665
|
-
cl_event evt;
|
|
3666
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3667
|
-
|
|
3668
|
-
g_profiling_info.emplace_back();
|
|
3669
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3670
|
-
#else
|
|
3671
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3672
|
-
#endif
|
|
3672
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3673
3673
|
} else {
|
|
3674
3674
|
unsigned int nth = MIN(64, ne0);
|
|
3675
3675
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3676
3676
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3677
3677
|
|
|
3678
|
-
|
|
3679
|
-
cl_event evt;
|
|
3680
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3681
|
-
|
|
3682
|
-
g_profiling_info.emplace_back();
|
|
3683
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3684
|
-
#else
|
|
3685
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3686
|
-
#endif
|
|
3678
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3687
3679
|
}
|
|
3688
3680
|
}
|
|
3689
3681
|
|
|
@@ -3723,7 +3715,6 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3723
3715
|
const cl_ulong nb3 = dst->nb[3];
|
|
3724
3716
|
|
|
3725
3717
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3726
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3727
3718
|
|
|
3728
3719
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3729
3720
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -3786,29 +3777,13 @@ static void ggml_cl_sub(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3786
3777
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3787
3778
|
size_t local_work_size[] = {64, 1, 1};
|
|
3788
3779
|
|
|
3789
|
-
|
|
3790
|
-
cl_event evt;
|
|
3791
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3792
|
-
|
|
3793
|
-
g_profiling_info.emplace_back();
|
|
3794
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3795
|
-
#else
|
|
3796
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3797
|
-
#endif
|
|
3780
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3798
3781
|
} else {
|
|
3799
3782
|
unsigned int nth = MIN(64, ne0);
|
|
3800
3783
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
3801
3784
|
size_t local_work_size[] = {nth, 1, 1};
|
|
3802
3785
|
|
|
3803
|
-
|
|
3804
|
-
cl_event evt;
|
|
3805
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3806
|
-
|
|
3807
|
-
g_profiling_info.emplace_back();
|
|
3808
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3809
|
-
#else
|
|
3810
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
3811
|
-
#endif
|
|
3786
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3812
3787
|
}
|
|
3813
3788
|
}
|
|
3814
3789
|
|
|
@@ -3821,7 +3796,6 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3821
3796
|
UNUSED(src1);
|
|
3822
3797
|
|
|
3823
3798
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3824
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3825
3799
|
|
|
3826
3800
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3827
3801
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3848,15 +3822,7 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3848
3822
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3849
3823
|
size_t local_work_size[] = {64, 1, 1};
|
|
3850
3824
|
|
|
3851
|
-
|
|
3852
|
-
cl_event evt;
|
|
3853
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
3854
|
-
|
|
3855
|
-
g_profiling_info.emplace_back();
|
|
3856
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3857
|
-
#else
|
|
3858
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
|
3859
|
-
#endif
|
|
3825
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3860
3826
|
}
|
|
3861
3827
|
|
|
3862
3828
|
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3868,7 +3834,6 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3868
3834
|
UNUSED(src1);
|
|
3869
3835
|
|
|
3870
3836
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3871
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3872
3837
|
|
|
3873
3838
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3874
3839
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3895,15 +3860,7 @@ static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
3895
3860
|
size_t global_work_size[] = {(size_t)n, 1, 1};
|
|
3896
3861
|
size_t local_work_size[] = {64, 1, 1};
|
|
3897
3862
|
|
|
3898
|
-
|
|
3899
|
-
cl_event evt;
|
|
3900
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt);
|
|
3901
|
-
|
|
3902
|
-
g_profiling_info.emplace_back();
|
|
3903
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
3904
|
-
#else
|
|
3905
|
-
clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL);
|
|
3906
|
-
#endif
|
|
3863
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3907
3864
|
}
|
|
3908
3865
|
|
|
3909
3866
|
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3915,7 +3872,6 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3915
3872
|
UNUSED(src1);
|
|
3916
3873
|
|
|
3917
3874
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3918
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3919
3875
|
|
|
3920
3876
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3921
3877
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3947,15 +3903,7 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3947
3903
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3948
3904
|
}
|
|
3949
3905
|
|
|
3950
|
-
|
|
3951
|
-
cl_event evt;
|
|
3952
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3953
|
-
|
|
3954
|
-
g_profiling_info.emplace_back();
|
|
3955
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
3956
|
-
#else
|
|
3957
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
3958
|
-
#endif
|
|
3906
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
3959
3907
|
}
|
|
3960
3908
|
|
|
3961
3909
|
static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3967,7 +3915,6 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3967
3915
|
UNUSED(src1);
|
|
3968
3916
|
|
|
3969
3917
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3970
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3971
3918
|
|
|
3972
3919
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
3973
3920
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -3992,15 +3939,7 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3992
3939
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
3993
3940
|
}
|
|
3994
3941
|
|
|
3995
|
-
|
|
3996
|
-
cl_event evt;
|
|
3997
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
3998
|
-
|
|
3999
|
-
g_profiling_info.emplace_back();
|
|
4000
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4001
|
-
#else
|
|
4002
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4003
|
-
#endif
|
|
3942
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4004
3943
|
}
|
|
4005
3944
|
|
|
4006
3945
|
static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4012,7 +3951,6 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4012
3951
|
UNUSED(src1);
|
|
4013
3952
|
|
|
4014
3953
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4015
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4016
3954
|
|
|
4017
3955
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4018
3956
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4044,15 +3982,7 @@ static void ggml_cl_sigmoid(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4044
3982
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
4045
3983
|
}
|
|
4046
3984
|
|
|
4047
|
-
|
|
4048
|
-
cl_event evt;
|
|
4049
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4050
|
-
|
|
4051
|
-
g_profiling_info.emplace_back();
|
|
4052
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4053
|
-
#else
|
|
4054
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4055
|
-
#endif
|
|
3985
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4056
3986
|
}
|
|
4057
3987
|
|
|
4058
3988
|
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4064,7 +3994,6 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4064
3994
|
UNUSED(src1);
|
|
4065
3995
|
|
|
4066
3996
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4067
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4068
3997
|
|
|
4069
3998
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4070
3999
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4096,15 +4025,7 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
4096
4025
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
4097
4026
|
}
|
|
4098
4027
|
|
|
4099
|
-
|
|
4100
|
-
cl_event evt;
|
|
4101
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4102
|
-
|
|
4103
|
-
g_profiling_info.emplace_back();
|
|
4104
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
4105
|
-
#else
|
|
4106
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4107
|
-
#endif
|
|
4028
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4108
4029
|
}
|
|
4109
4030
|
|
|
4110
4031
|
static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4116,7 +4037,6 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4116
4037
|
UNUSED(src1);
|
|
4117
4038
|
|
|
4118
4039
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4119
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4120
4040
|
|
|
4121
4041
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4122
4042
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4157,15 +4077,7 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4157
4077
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
4158
4078
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
4159
4079
|
|
|
4160
|
-
|
|
4161
|
-
cl_event evt;
|
|
4162
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4163
|
-
|
|
4164
|
-
g_profiling_info.emplace_back();
|
|
4165
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4166
|
-
#else
|
|
4167
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4168
|
-
#endif
|
|
4080
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4169
4081
|
}
|
|
4170
4082
|
|
|
4171
4083
|
static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4177,7 +4089,6 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
4177
4089
|
UNUSED(src1);
|
|
4178
4090
|
|
|
4179
4091
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4180
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4181
4092
|
|
|
4182
4093
|
//ggml_backend_opencl_device_context * dev_ctx =
|
|
4183
4094
|
// (ggml_backend_opencl_device_context *)backend->device->context;
|
|
@@ -4241,15 +4152,7 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
4241
4152
|
// This is local memory - the size depends on subgroup size.
|
|
4242
4153
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
|
4243
4154
|
|
|
4244
|
-
|
|
4245
|
-
cl_event evt;
|
|
4246
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4247
|
-
|
|
4248
|
-
g_profiling_info.emplace_back();
|
|
4249
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4250
|
-
#else
|
|
4251
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4252
|
-
#endif
|
|
4155
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4253
4156
|
}
|
|
4254
4157
|
|
|
4255
4158
|
static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4261,7 +4164,6 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4261
4164
|
UNUSED(src1);
|
|
4262
4165
|
|
|
4263
4166
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4264
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4265
4167
|
|
|
4266
4168
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4267
4169
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4300,15 +4202,7 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
4300
4202
|
size_t global_work_size[] = {(size_t)n_groups*sgs, 1, 1};
|
|
4301
4203
|
size_t local_work_size[] = {(size_t)sgs, 1, 1};
|
|
4302
4204
|
|
|
4303
|
-
|
|
4304
|
-
cl_event evt;
|
|
4305
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
4306
|
-
|
|
4307
|
-
g_profiling_info.emplace_back();
|
|
4308
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
4309
|
-
#else
|
|
4310
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
4311
|
-
#endif
|
|
4205
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
4312
4206
|
}
|
|
4313
4207
|
|
|
4314
4208
|
static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4320,7 +4214,6 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4320
4214
|
UNUSED(src1);
|
|
4321
4215
|
|
|
4322
4216
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4323
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4324
4217
|
|
|
4325
4218
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4326
4219
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -4397,16 +4290,7 @@ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4397
4290
|
}
|
|
4398
4291
|
if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
|
|
4399
4292
|
|
|
4400
|
-
|
|
4401
|
-
#ifdef GGML_OPENCL_PROFILING
|
|
4402
|
-
cl_event evt;
|
|
4403
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4404
|
-
|
|
4405
|
-
g_profiling_info.emplace_back();
|
|
4406
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
|
4407
|
-
#else
|
|
4408
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4409
|
-
#endif
|
|
4293
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4410
4294
|
}
|
|
4411
4295
|
|
|
4412
4296
|
static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
|
|
@@ -4419,7 +4303,6 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4419
4303
|
UNUSED(src1_shape_def);
|
|
4420
4304
|
|
|
4421
4305
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4422
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4423
4306
|
|
|
4424
4307
|
if (backend_ctx->kernel_repeat == nullptr) {
|
|
4425
4308
|
GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4467,15 +4350,7 @@ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4467
4350
|
|
|
4468
4351
|
size_t global_work_size[] = { gws0, gws1, gws2 };
|
|
4469
4352
|
|
|
4470
|
-
|
|
4471
|
-
cl_event evt;
|
|
4472
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
|
|
4473
|
-
|
|
4474
|
-
g_profiling_info.emplace_back();
|
|
4475
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
|
|
4476
|
-
#else
|
|
4477
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
|
|
4478
|
-
#endif
|
|
4353
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4479
4354
|
}
|
|
4480
4355
|
|
|
4481
4356
|
static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
@@ -4488,7 +4363,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
4488
4363
|
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
|
4489
4364
|
|
|
4490
4365
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4491
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4492
4366
|
|
|
4493
4367
|
if (backend_ctx->kernel_pad == nullptr) {
|
|
4494
4368
|
GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4533,15 +4407,7 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
4533
4407
|
local_work_size_ptr = nullptr;
|
|
4534
4408
|
}
|
|
4535
4409
|
|
|
4536
|
-
|
|
4537
|
-
cl_event evt;
|
|
4538
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4539
|
-
|
|
4540
|
-
g_profiling_info.emplace_back();
|
|
4541
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
|
|
4542
|
-
#else
|
|
4543
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4544
|
-
#endif
|
|
4410
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4545
4411
|
}
|
|
4546
4412
|
|
|
4547
4413
|
static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
|
|
@@ -4553,7 +4419,6 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4553
4419
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4554
4420
|
|
|
4555
4421
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4556
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4557
4422
|
|
|
4558
4423
|
const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
|
|
4559
4424
|
cl_kernel kernel = nullptr;
|
|
@@ -4644,17 +4509,7 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
4644
4509
|
local_work_size_ptr = nullptr;
|
|
4645
4510
|
}
|
|
4646
4511
|
|
|
4647
|
-
|
|
4648
|
-
cl_event evt;
|
|
4649
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
4650
|
-
|
|
4651
|
-
g_profiling_info.emplace_back();
|
|
4652
|
-
size_t profiling_gws[3] = {global_work_size[0], 1, 1};
|
|
4653
|
-
size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
|
|
4654
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
|
|
4655
|
-
#else
|
|
4656
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
4657
|
-
#endif
|
|
4512
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
4658
4513
|
}
|
|
4659
4514
|
|
|
4660
4515
|
static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4732,7 +4587,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4732
4587
|
global_work_size[1] = d_ne1;
|
|
4733
4588
|
global_work_size[2] = d_ne2;
|
|
4734
4589
|
|
|
4735
|
-
|
|
4590
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4736
4591
|
}
|
|
4737
4592
|
}
|
|
4738
4593
|
} else {
|
|
@@ -4782,7 +4637,7 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
4782
4637
|
d_ne2 > 0 ? (size_t)d_ne2 : 1,
|
|
4783
4638
|
d_ne3 > 0 ? (size_t)d_ne3 : 1 };
|
|
4784
4639
|
|
|
4785
|
-
|
|
4640
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size_nc, NULL, dst);
|
|
4786
4641
|
}
|
|
4787
4642
|
}
|
|
4788
4643
|
|
|
@@ -4795,7 +4650,6 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
4795
4650
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
4796
4651
|
|
|
4797
4652
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4798
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4799
4653
|
|
|
4800
4654
|
if (backend_ctx->kernel_timestep_embedding == nullptr) {
|
|
4801
4655
|
GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
|
|
@@ -4828,17 +4682,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
4828
4682
|
|
|
4829
4683
|
size_t global_work_size[] = {gws0, gws1, 1};
|
|
4830
4684
|
|
|
4831
|
-
|
|
4832
|
-
cl_event evt;
|
|
4833
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
|
|
4834
|
-
|
|
4835
|
-
g_profiling_info.emplace_back();
|
|
4836
|
-
size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
|
|
4837
|
-
size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
|
|
4838
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
|
|
4839
|
-
#else
|
|
4840
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
|
|
4841
|
-
#endif
|
|
4685
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, NULL, dst);
|
|
4842
4686
|
}
|
|
4843
4687
|
|
|
4844
4688
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -4853,7 +4697,6 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
4853
4697
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
4854
4698
|
|
|
4855
4699
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
4856
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
4857
4700
|
|
|
4858
4701
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
4859
4702
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -5058,15 +4901,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5058
4901
|
static_cast<size_t>(padded_height_B)
|
|
5059
4902
|
};
|
|
5060
4903
|
|
|
5061
|
-
|
|
5062
|
-
cl_event evt;
|
|
5063
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt));
|
|
5064
|
-
|
|
5065
|
-
g_profiling_info.emplace_back();
|
|
5066
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst);
|
|
5067
|
-
#else
|
|
5068
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL));
|
|
5069
|
-
#endif
|
|
4904
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_size_t, local_size_t, dst);
|
|
5070
4905
|
} else {
|
|
5071
4906
|
// no need to transpose B in other cases
|
|
5072
4907
|
// create an image for B from sub_buffer
|
|
@@ -5188,16 +5023,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5188
5023
|
|
|
5189
5024
|
// enqueue kernel with profiling
|
|
5190
5025
|
// <--------------------------------------------> //
|
|
5191
|
-
|
|
5192
|
-
cl_event evt;
|
|
5193
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5194
|
-
|
|
5195
|
-
g_profiling_info.emplace_back();
|
|
5196
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5197
|
-
// enqueue kernel without profiling
|
|
5198
|
-
#else
|
|
5199
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5200
|
-
#endif
|
|
5026
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5201
5027
|
// <--------------------------------------------> //
|
|
5202
5028
|
|
|
5203
5029
|
// deallocate sub buffers and images
|
|
@@ -5277,15 +5103,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5277
5103
|
global_work_size[2] = (size_t)ne12*ne13;
|
|
5278
5104
|
}
|
|
5279
5105
|
|
|
5280
|
-
|
|
5281
|
-
cl_event evt;
|
|
5282
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5283
|
-
|
|
5284
|
-
g_profiling_info.emplace_back();
|
|
5285
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5286
|
-
#else
|
|
5287
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5288
|
-
#endif
|
|
5106
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5289
5107
|
return;
|
|
5290
5108
|
}
|
|
5291
5109
|
#else // GGML_OPENCL_SOA_Q
|
|
@@ -5515,15 +5333,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5515
5333
|
size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
5516
5334
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5517
5335
|
|
|
5518
|
-
|
|
5519
|
-
cl_event evt;
|
|
5520
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5521
|
-
|
|
5522
|
-
g_profiling_info.emplace_back();
|
|
5523
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5524
|
-
#else
|
|
5525
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5526
|
-
#endif
|
|
5336
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5527
5337
|
} else if (src0t == GGML_TYPE_Q4_K) {
|
|
5528
5338
|
GGML_ASSERT(false && "not implemented");
|
|
5529
5339
|
} else if (src0t == GGML_TYPE_Q3_K) {
|
|
@@ -5534,30 +5344,14 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
5534
5344
|
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13};
|
|
5535
5345
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5536
5346
|
|
|
5537
|
-
|
|
5538
|
-
cl_event evt;
|
|
5539
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5540
|
-
|
|
5541
|
-
g_profiling_info.emplace_back();
|
|
5542
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5543
|
-
#else
|
|
5544
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5545
|
-
#endif
|
|
5347
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5546
5348
|
} else {
|
|
5547
5349
|
int64_t ny = (ne11 + nrows - 1)/nrows;
|
|
5548
5350
|
|
|
5549
5351
|
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
|
|
5550
5352
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
|
|
5551
5353
|
|
|
5552
|
-
|
|
5553
|
-
cl_event evt;
|
|
5554
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5555
|
-
|
|
5556
|
-
g_profiling_info.emplace_back();
|
|
5557
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5558
|
-
#else
|
|
5559
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5560
|
-
#endif
|
|
5354
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5561
5355
|
}
|
|
5562
5356
|
}
|
|
5563
5357
|
|
|
@@ -5574,7 +5368,6 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
5574
5368
|
GGML_ASSERT(src2->extra);
|
|
5575
5369
|
|
|
5576
5370
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5577
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5578
5371
|
|
|
5579
5372
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
5580
5373
|
ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
|
|
@@ -5680,15 +5473,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
5680
5473
|
size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
|
|
5681
5474
|
size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
|
|
5682
5475
|
|
|
5683
|
-
|
|
5684
|
-
cl_event evt;
|
|
5685
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5686
|
-
|
|
5687
|
-
g_profiling_info.emplace_back();
|
|
5688
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5689
|
-
#else
|
|
5690
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5691
|
-
#endif
|
|
5476
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5692
5477
|
}
|
|
5693
5478
|
|
|
5694
5479
|
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5701,7 +5486,6 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5701
5486
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
5702
5487
|
|
|
5703
5488
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5704
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5705
5489
|
|
|
5706
5490
|
float scale;
|
|
5707
5491
|
memcpy(&scale, dst->op_params, sizeof(scale));
|
|
@@ -5730,15 +5514,7 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
|
|
|
5730
5514
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
5731
5515
|
}
|
|
5732
5516
|
|
|
5733
|
-
|
|
5734
|
-
cl_event evt;
|
|
5735
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
5736
|
-
|
|
5737
|
-
g_profiling_info.emplace_back();
|
|
5738
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
5739
|
-
#else
|
|
5740
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
5741
|
-
#endif
|
|
5517
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
5742
5518
|
}
|
|
5743
5519
|
|
|
5744
5520
|
static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5775,7 +5551,6 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5775
5551
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
|
5776
5552
|
|
|
5777
5553
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5778
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5779
5554
|
|
|
5780
5555
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5781
5556
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -5840,15 +5615,7 @@ static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
5840
5615
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
5841
5616
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
5842
5617
|
|
|
5843
|
-
|
|
5844
|
-
cl_event evt;
|
|
5845
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5846
|
-
|
|
5847
|
-
g_profiling_info.emplace_back();
|
|
5848
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1);
|
|
5849
|
-
#else
|
|
5850
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5851
|
-
#endif
|
|
5618
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src1);
|
|
5852
5619
|
}
|
|
5853
5620
|
|
|
5854
5621
|
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -5871,7 +5638,6 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5871
5638
|
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
5872
5639
|
|
|
5873
5640
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5874
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5875
5641
|
|
|
5876
5642
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5877
5643
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -5895,15 +5661,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5895
5661
|
size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1};
|
|
5896
5662
|
size_t local_work_size[] = {64, 1, 1};
|
|
5897
5663
|
|
|
5898
|
-
|
|
5899
|
-
cl_event evt;
|
|
5900
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
5901
|
-
|
|
5902
|
-
g_profiling_info.emplace_back();
|
|
5903
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
5904
|
-
#else
|
|
5905
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
5906
|
-
#endif
|
|
5664
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5907
5665
|
} else {
|
|
5908
5666
|
kernel = backend_ctx->kernel_diag_mask_inf;
|
|
5909
5667
|
|
|
@@ -5923,15 +5681,7 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
|
|
|
5923
5681
|
local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
|
|
5924
5682
|
}
|
|
5925
5683
|
|
|
5926
|
-
|
|
5927
|
-
cl_event evt;
|
|
5928
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
|
|
5929
|
-
|
|
5930
|
-
g_profiling_info.emplace_back();
|
|
5931
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
|
|
5932
|
-
#else
|
|
5933
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
|
|
5934
|
-
#endif
|
|
5684
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size_ptr, dst);
|
|
5935
5685
|
}
|
|
5936
5686
|
}
|
|
5937
5687
|
|
|
@@ -5951,7 +5701,6 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
5951
5701
|
}
|
|
5952
5702
|
|
|
5953
5703
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5954
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
5955
5704
|
|
|
5956
5705
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
5957
5706
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6031,15 +5780,7 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6031
5780
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
6032
5781
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
6033
5782
|
|
|
6034
|
-
|
|
6035
|
-
cl_event evt;
|
|
6036
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6037
|
-
|
|
6038
|
-
g_profiling_info.emplace_back();
|
|
6039
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6040
|
-
#else
|
|
6041
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6042
|
-
#endif
|
|
5783
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6043
5784
|
}
|
|
6044
5785
|
|
|
6045
5786
|
static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6051,7 +5792,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
6051
5792
|
GGML_ASSERT(dst->extra);
|
|
6052
5793
|
|
|
6053
5794
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6054
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6055
5795
|
|
|
6056
5796
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6057
5797
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
@@ -6217,15 +5957,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
6217
5957
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
6218
5958
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
6219
5959
|
|
|
6220
|
-
|
|
6221
|
-
cl_event evt;
|
|
6222
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6223
|
-
|
|
6224
|
-
g_profiling_info.emplace_back();
|
|
6225
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6226
|
-
#else
|
|
6227
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6228
|
-
#endif
|
|
5960
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6229
5961
|
}
|
|
6230
5962
|
|
|
6231
5963
|
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6240,7 +5972,6 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
6240
5972
|
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
|
6241
5973
|
|
|
6242
5974
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6243
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6244
5975
|
|
|
6245
5976
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
6246
5977
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6309,15 +6040,7 @@ static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
6309
6040
|
size_t global_work_size[] = {(size_t)num_blocks*256, (size_t)OH, (size_t)batch*IC};
|
|
6310
6041
|
size_t local_work_size[] = {256, 1, 1};
|
|
6311
6042
|
|
|
6312
|
-
|
|
6313
|
-
cl_event evt;
|
|
6314
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6315
|
-
|
|
6316
|
-
g_profiling_info.emplace_back();
|
|
6317
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6318
|
-
#else
|
|
6319
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6320
|
-
#endif
|
|
6043
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6321
6044
|
}
|
|
6322
6045
|
|
|
6323
6046
|
static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6332,7 +6055,6 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6332
6055
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
6333
6056
|
|
|
6334
6057
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6335
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6336
6058
|
|
|
6337
6059
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6338
6060
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6364,15 +6086,7 @@ static void ggml_cl_argsort(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6364
6086
|
size_t global_work_size[] = {(size_t)ne00_padded, (size_t)nrows, (size_t)1};
|
|
6365
6087
|
size_t local_work_size[] = {(size_t)ne00_padded, 1, 1};
|
|
6366
6088
|
|
|
6367
|
-
|
|
6368
|
-
cl_event evt;
|
|
6369
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6370
|
-
|
|
6371
|
-
g_profiling_info.emplace_back();
|
|
6372
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6373
|
-
#else
|
|
6374
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6375
|
-
#endif
|
|
6089
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6376
6090
|
}
|
|
6377
6091
|
|
|
6378
6092
|
static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -6386,7 +6100,6 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6386
6100
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
|
6387
6101
|
|
|
6388
6102
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6389
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
6390
6103
|
|
|
6391
6104
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6392
6105
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -6427,15 +6140,7 @@ static void ggml_cl_sum_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
6427
6140
|
size_t global_work_size[] = {(size_t)ne01, (size_t)ne02, (size_t)ne03};
|
|
6428
6141
|
size_t local_work_size[] = {(size_t)64, 1, 1};
|
|
6429
6142
|
|
|
6430
|
-
|
|
6431
|
-
cl_event evt;
|
|
6432
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
6433
|
-
|
|
6434
|
-
g_profiling_info.emplace_back();
|
|
6435
|
-
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
|
|
6436
|
-
#else
|
|
6437
|
-
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
|
6438
|
-
#endif
|
|
6143
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6439
6144
|
}
|
|
6440
6145
|
|
|
6441
6146
|
//------------------------------------------------------------------------------
|