@fugood/llama.node 0.3.16 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -64,11 +64,33 @@ enum ADRENO_GPU_GEN {
|
|
|
64
64
|
X1E,
|
|
65
65
|
};
|
|
66
66
|
|
|
67
|
+
// Adreno OpenCL compiler families. The enumerator names match the tags that
// get_adreno_cl_compiler_version() searches for in the driver version string.
enum ADRENO_CL_COMPILER_TYPE {
    E031 = 0,
    DX   = 1,
};
|
|
71
|
+
|
|
67
72
|
struct ggml_cl_version {
|
|
68
73
|
cl_uint major = 0;
|
|
69
74
|
cl_uint minor = 0;
|
|
70
75
|
};
|
|
71
76
|
|
|
77
|
+
struct ggml_cl_compiler_version {
|
|
78
|
+
ADRENO_CL_COMPILER_TYPE type;
|
|
79
|
+
int major = -1;
|
|
80
|
+
int minor = -1;
|
|
81
|
+
int patch = -1;
|
|
82
|
+
|
|
83
|
+
bool same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
|
|
84
|
+
return major == x && minor == y && patch == z && type == t;
|
|
85
|
+
}
|
|
86
|
+
bool newer_than(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
|
|
87
|
+
return major*10000 + minor*100 + patch > x*10000 + y*100 + z && type == t;
|
|
88
|
+
}
|
|
89
|
+
bool newer_than_or_same(ADRENO_CL_COMPILER_TYPE t, int x, int y, int z) const {
|
|
90
|
+
return same(t, x, y, z) || newer_than(t, x, y, z);
|
|
91
|
+
}
|
|
92
|
+
};
|
|
93
|
+
|
|
72
94
|
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
|
73
95
|
static ggml_cl_version parse_cl_version(std::string_view str) {
|
|
74
96
|
size_t major_str_begin = 0;
|
|
@@ -173,24 +195,30 @@ static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
|
|
173
195
|
return ADRENO_GPU_GEN::ADRENO_UNKNOWN;
|
|
174
196
|
}
|
|
175
197
|
|
|
176
|
-
static
|
|
198
|
+
// Extracts the Adreno OpenCL compiler version from a driver version string.
//
// The string is searched for the tag "E031" first and for "DX" only when
// "E031" is absent. After the tag, three two-character numeric fields are read
// at fixed offsets. The offsets imply the layouts "E031.MM.mm.pp" (13 chars)
// and "DX.MM.mm.pp" (11 chars); the separator characters are not inspected.
//
// Returns the compiler type together with major/minor/patch. When neither tag
// is present, or driver_version is null, a `{}`-initialized
// ggml_cl_compiler_version is returned. A field that lies beyond the end of a
// truncated string is reported as 0 instead of throwing std::out_of_range.
static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *driver_version) {
    if (driver_version == nullptr) {
        return {};
    }
    const std::string driver_ver_str(driver_version);

    ADRENO_CL_COMPILER_TYPE type = ADRENO_CL_COMPILER_TYPE::E031;
    size_t compiler_ver_pos      = driver_ver_str.find("E031");
    size_t compiler_ver_len      = 13;
    size_t compiler_major_offset = 5;
    size_t compiler_minor_offset = 8;
    size_t compiler_patch_offset = 11;

    if (compiler_ver_pos == std::string::npos) {
        compiler_ver_pos = driver_ver_str.find("DX");
        if (compiler_ver_pos == std::string::npos) {
            return {};
        }
        type             = ADRENO_CL_COMPILER_TYPE::DX;
        compiler_ver_len = 11;
        // "DX" is two characters shorter than "E031", so every field (not only
        // the major one) sits two characters earlier.
        compiler_major_offset = 3;
        compiler_minor_offset = 6;
        compiler_patch_offset = 9;
    }

    const std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len);

    // Reads the two-character field at `offset`. substr() throws when the
    // offset is past the end of the string, so a missing field yields 0, the
    // same value std::atoi gives for an empty field.
    auto parse_field = [&compiler_ver_str](size_t offset) -> int {
        if (offset >= compiler_ver_str.size()) {
            return 0;
        }
        return std::atoi(compiler_ver_str.substr(offset, 2).c_str());
    };

    const int major = parse_field(compiler_major_offset);
    const int minor = parse_field(compiler_minor_offset);
    const int patch = parse_field(compiler_patch_offset);
    return { type, major, minor, patch };
}
|
|
195
223
|
|
|
196
224
|
// backend device context
|
|
@@ -215,21 +243,55 @@ struct ggml_backend_opencl_context {
|
|
|
215
243
|
cl_int alignment;
|
|
216
244
|
size_t max_alloc_size;
|
|
217
245
|
bool fp16_support;
|
|
246
|
+
bool has_vector_subgroup_broadcast;
|
|
247
|
+
ggml_cl_compiler_version adreno_cl_compiler_version;
|
|
218
248
|
|
|
219
249
|
int adreno_wave_size;
|
|
220
250
|
|
|
221
251
|
cl_context context;
|
|
222
252
|
cl_command_queue queue;
|
|
223
253
|
|
|
224
|
-
cl_program
|
|
225
|
-
cl_program
|
|
226
|
-
cl_program
|
|
254
|
+
cl_program program_add;
|
|
255
|
+
cl_program program_clamp;
|
|
256
|
+
cl_program program_cpy;
|
|
257
|
+
cl_program program_cvt;
|
|
258
|
+
cl_program program_diag_mask_inf;
|
|
259
|
+
cl_program program_gelu;
|
|
260
|
+
cl_program program_gemv_noshuffle_general;
|
|
261
|
+
cl_program program_gemv_noshuffle;
|
|
262
|
+
cl_program program_get_rows;
|
|
263
|
+
cl_program program_im2col_f16;
|
|
264
|
+
cl_program program_im2col_f32;
|
|
265
|
+
cl_program program_mul_mat_Ab_Bi_8x4;
|
|
266
|
+
cl_program program_mul_mv_q4_0_f32;
|
|
267
|
+
cl_program program_mul_mv_q4_0_f32_v;
|
|
268
|
+
cl_program program_mul_mv_q4_0_f32_8x_flat;
|
|
269
|
+
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
|
|
270
|
+
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
|
|
271
|
+
cl_program program_mul_mv_q6_K;
|
|
272
|
+
cl_program program_mul_mv_f16_f16;
|
|
273
|
+
cl_program program_mul_mv_f16_f32_1row;
|
|
274
|
+
cl_program program_mul_mv_f16_f32_l4;
|
|
275
|
+
cl_program program_mul_mv_f16_f32;
|
|
276
|
+
cl_program program_mul_mv_f32_f32;
|
|
277
|
+
cl_program program_mul;
|
|
278
|
+
cl_program program_norm;
|
|
279
|
+
cl_program program_relu;
|
|
280
|
+
cl_program program_rms_norm;
|
|
281
|
+
cl_program program_rope;
|
|
282
|
+
cl_program program_scale;
|
|
283
|
+
cl_program program_silu;
|
|
284
|
+
cl_program program_softmax_f32;
|
|
285
|
+
cl_program program_softmax_f16;
|
|
286
|
+
cl_program program_softmax_4_f32;
|
|
287
|
+
cl_program program_softmax_4_f16;
|
|
227
288
|
|
|
228
289
|
cl_kernel kernel_add, kernel_add_row;
|
|
229
290
|
cl_kernel kernel_mul, kernel_mul_row;
|
|
230
291
|
cl_kernel kernel_scale;
|
|
231
292
|
cl_kernel kernel_silu, kernel_silu_4;
|
|
232
293
|
cl_kernel kernel_gelu, kernel_gelu_4;
|
|
294
|
+
cl_kernel kernel_gelu_quick, kernel_gelu_quick_4;
|
|
233
295
|
cl_kernel kernel_relu;
|
|
234
296
|
cl_kernel kernel_clamp;
|
|
235
297
|
cl_kernel kernel_norm;
|
|
@@ -239,6 +301,7 @@ struct ggml_backend_opencl_context {
|
|
|
239
301
|
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
|
240
302
|
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
|
241
303
|
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
|
304
|
+
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
|
242
305
|
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
243
306
|
cl_kernel kernel_mul_mat_f32_f32;
|
|
244
307
|
cl_kernel kernel_mul_mat_f16_f16;
|
|
@@ -246,18 +309,17 @@ struct ggml_backend_opencl_context {
|
|
|
246
309
|
cl_kernel kernel_mul_mat_f16_f32;
|
|
247
310
|
cl_kernel kernel_mul_mat_f16_f32_l4;
|
|
248
311
|
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
|
249
|
-
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0
|
|
312
|
+
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
|
250
313
|
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
|
251
|
-
cl_kernel kernel_convert_block_q4_0_noshuffle
|
|
252
|
-
kernel_mul_mat_q4_0_f32_flat_img_v0;
|
|
314
|
+
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
|
253
315
|
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
|
254
316
|
cl_kernel kernel_mul_mv_q6_K_f32;
|
|
317
|
+
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
|
255
318
|
|
|
256
319
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
257
320
|
// Transpose kernels
|
|
258
|
-
cl_program
|
|
259
|
-
|
|
260
|
-
cl_program program_transpose_16;
|
|
321
|
+
cl_program program_transpose;
|
|
322
|
+
|
|
261
323
|
cl_kernel kernel_transpose_32;
|
|
262
324
|
cl_kernel kernel_transpose_32_16;
|
|
263
325
|
cl_kernel kernel_transpose_16;
|
|
@@ -370,6 +432,681 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|
|
370
432
|
return p;
|
|
371
433
|
}
|
|
372
434
|
|
|
435
|
+
static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_version opencl_c_version) {
|
|
436
|
+
cl_int err;
|
|
437
|
+
|
|
438
|
+
// compiler options for general kernels
|
|
439
|
+
auto opencl_c_std =
|
|
440
|
+
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
|
441
|
+
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
442
|
+
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
|
443
|
+
" -cl-finite-math-only -cl-fast-relaxed-math";
|
|
444
|
+
|
|
445
|
+
GGML_LOG_INFO("ggml_opencl: loading OpenCL kernels");
|
|
446
|
+
|
|
447
|
+
// add
|
|
448
|
+
{
|
|
449
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
450
|
+
const std::string kernel_src {
|
|
451
|
+
#include "add.cl.h"
|
|
452
|
+
};
|
|
453
|
+
#else
|
|
454
|
+
const std::string kernel_src = read_file("add.cl");
|
|
455
|
+
#endif
|
|
456
|
+
backend_ctx->program_add =
|
|
457
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
458
|
+
|
|
459
|
+
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program_add, "kernel_add", &err), err));
|
|
460
|
+
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program_add, "kernel_add_row", &err), err));
|
|
461
|
+
GGML_LOG_CONT(".");
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
// clamp
|
|
465
|
+
{
|
|
466
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
467
|
+
const std::string kernel_src {
|
|
468
|
+
#include "clamp.cl.h"
|
|
469
|
+
};
|
|
470
|
+
#else
|
|
471
|
+
const std::string kernel_src = read_file("clamp.cl");
|
|
472
|
+
#endif
|
|
473
|
+
backend_ctx->program_clamp =
|
|
474
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
475
|
+
|
|
476
|
+
CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program_clamp, "kernel_clamp", &err), err));
|
|
477
|
+
GGML_LOG_CONT(".");
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
// cpy
|
|
481
|
+
{
|
|
482
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
483
|
+
const std::string kernel_src {
|
|
484
|
+
#include "cpy.cl.h"
|
|
485
|
+
};
|
|
486
|
+
#else
|
|
487
|
+
const std::string kernel_src = read_file("cpy.cl");
|
|
488
|
+
#endif
|
|
489
|
+
backend_ctx->program_cpy =
|
|
490
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
491
|
+
|
|
492
|
+
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f16", &err), err));
|
|
493
|
+
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f16_f32", &err), err));
|
|
494
|
+
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f16", &err), err));
|
|
495
|
+
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program_cpy, "kernel_cpy_f32_f32", &err), err));
|
|
496
|
+
GGML_LOG_CONT(".");
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
// cvt
|
|
500
|
+
{
|
|
501
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
502
|
+
const std::string kernel_src {
|
|
503
|
+
#include "cvt.cl.h"
|
|
504
|
+
};
|
|
505
|
+
#else
|
|
506
|
+
const std::string kernel_src = read_file("cvt.cl");
|
|
507
|
+
#endif
|
|
508
|
+
backend_ctx->program_cvt =
|
|
509
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
510
|
+
|
|
511
|
+
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
|
512
|
+
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
|
513
|
+
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
|
514
|
+
GGML_LOG_CONT(".");
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
// diag_mask_inf
|
|
518
|
+
{
|
|
519
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
520
|
+
const std::string kernel_src {
|
|
521
|
+
#include "diag_mask_inf.cl.h"
|
|
522
|
+
};
|
|
523
|
+
#else
|
|
524
|
+
const std::string kernel_src = read_file("diag_mask_inf.cl");
|
|
525
|
+
#endif
|
|
526
|
+
backend_ctx->program_diag_mask_inf =
|
|
527
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
528
|
+
|
|
529
|
+
CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf_8", &err), err));
|
|
530
|
+
CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program_diag_mask_inf, "kernel_diag_mask_inf", &err), err));
|
|
531
|
+
GGML_LOG_CONT(".");
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
// gelu
|
|
535
|
+
{
|
|
536
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
537
|
+
const std::string kernel_src {
|
|
538
|
+
#include "gelu.cl.h"
|
|
539
|
+
};
|
|
540
|
+
#else
|
|
541
|
+
const std::string kernel_src = read_file("gelu.cl");
|
|
542
|
+
#endif
|
|
543
|
+
backend_ctx->program_gelu =
|
|
544
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
545
|
+
|
|
546
|
+
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu", &err), err));
|
|
547
|
+
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_4", &err), err));
|
|
548
|
+
CL_CHECK((backend_ctx->kernel_gelu_quick = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick", &err), err));
|
|
549
|
+
CL_CHECK((backend_ctx->kernel_gelu_quick_4 = clCreateKernel(backend_ctx->program_gelu, "kernel_gelu_quick_4", &err), err));
|
|
550
|
+
GGML_LOG_CONT(".");
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
// get_rows
|
|
554
|
+
{
|
|
555
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
556
|
+
const std::string kernel_src {
|
|
557
|
+
#include "get_rows.cl.h"
|
|
558
|
+
};
|
|
559
|
+
#else
|
|
560
|
+
const std::string kernel_src = read_file("get_rows.cl");
|
|
561
|
+
#endif
|
|
562
|
+
backend_ctx->program_get_rows =
|
|
563
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
564
|
+
|
|
565
|
+
CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f32", &err), err));
|
|
566
|
+
CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_f16", &err), err));
|
|
567
|
+
CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program_get_rows, "kernel_get_rows_q4_0", &err), err));
|
|
568
|
+
GGML_LOG_CONT(".");
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
// im2col_f32
|
|
572
|
+
{
|
|
573
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
574
|
+
const std::string kernel_src {
|
|
575
|
+
#include "im2col_f32.cl.h"
|
|
576
|
+
};
|
|
577
|
+
#else
|
|
578
|
+
const std::string kernel_src = read_file("im2col_f32.cl");
|
|
579
|
+
#endif
|
|
580
|
+
backend_ctx->program_im2col_f32 =
|
|
581
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
582
|
+
|
|
583
|
+
CL_CHECK((backend_ctx->kernel_im2col_f32 = clCreateKernel(backend_ctx->program_im2col_f32, "kernel_im2col_f32", &err), err));
|
|
584
|
+
GGML_LOG_CONT(".");
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
// im2col_f16
|
|
588
|
+
{
|
|
589
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
590
|
+
const std::string kernel_src {
|
|
591
|
+
#include "im2col_f16.cl.h"
|
|
592
|
+
};
|
|
593
|
+
#else
|
|
594
|
+
const std::string kernel_src = read_file("im2col_f16.cl");
|
|
595
|
+
#endif
|
|
596
|
+
backend_ctx->program_im2col_f16 =
|
|
597
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
598
|
+
|
|
599
|
+
CL_CHECK((backend_ctx->kernel_im2col_f16 = clCreateKernel(backend_ctx->program_im2col_f16, "kernel_im2col_f16", &err), err));
|
|
600
|
+
GGML_LOG_CONT(".");
|
|
601
|
+
}
|
|
602
|
+
|
|
603
|
+
// mul_mv_q4_0_f32
|
|
604
|
+
{
|
|
605
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
606
|
+
const std::string kernel_src {
|
|
607
|
+
#include "mul_mv_q4_0_f32.cl.h"
|
|
608
|
+
};
|
|
609
|
+
#else
|
|
610
|
+
const std::string kernel_src = read_file("mul_mv_q4_0_f32.cl");
|
|
611
|
+
#endif
|
|
612
|
+
backend_ctx->program_mul_mv_q4_0_f32 =
|
|
613
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
614
|
+
|
|
615
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32, "kernel_mul_mat_q4_0_f32", &err), err));
|
|
616
|
+
GGML_LOG_CONT(".");
|
|
617
|
+
}
|
|
618
|
+
|
|
619
|
+
// mul_mv_q4_0_f32_v
|
|
620
|
+
{
|
|
621
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
622
|
+
const std::string kernel_src {
|
|
623
|
+
#include "mul_mv_q4_0_f32_v.cl.h"
|
|
624
|
+
};
|
|
625
|
+
#else
|
|
626
|
+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_v.cl");
|
|
627
|
+
#endif
|
|
628
|
+
backend_ctx->program_mul_mv_q4_0_f32_v =
|
|
629
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
630
|
+
|
|
631
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_v, "kernel_mul_mat_q4_0_f32_v", &err), err));
|
|
632
|
+
GGML_LOG_CONT(".");
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
// mul_mv_q4_0_f32_8x_flat
|
|
636
|
+
{
|
|
637
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
638
|
+
const std::string kernel_src {
|
|
639
|
+
#include "mul_mv_q4_0_f32_8x_flat.cl.h"
|
|
640
|
+
};
|
|
641
|
+
#else
|
|
642
|
+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_8x_flat.cl");
|
|
643
|
+
#endif
|
|
644
|
+
backend_ctx->program_mul_mv_q4_0_f32_8x_flat =
|
|
645
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
646
|
+
|
|
647
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_8x_flat, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
|
|
648
|
+
GGML_LOG_CONT(".");
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
// mul_mv_q4_0_f32_1d_8x_flat
|
|
652
|
+
// This kernel does not compile on Adreno cl compiler 38.01. Skip it for
|
|
653
|
+
// those compiler versions since it is anyway not used for Adreno.
|
|
654
|
+
if (backend_ctx->gpu_family != ADRENO ||
|
|
655
|
+
backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
|
|
656
|
+
backend_ctx->adreno_cl_compiler_version.type == DX) {
|
|
657
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
658
|
+
const std::string kernel_src {
|
|
659
|
+
#include "mul_mv_q4_0_f32_1d_8x_flat.cl.h"
|
|
660
|
+
};
|
|
661
|
+
#else
|
|
662
|
+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_8x_flat.cl");
|
|
663
|
+
#endif
|
|
664
|
+
backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat =
|
|
665
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
666
|
+
|
|
667
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_8x_flat, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
|
|
668
|
+
GGML_LOG_CONT(".");
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
// mul_mv_q4_0_f32_1d_16x_flat
|
|
672
|
+
// This kernel does not compile on Adreno cl compiler 38.01. Skip it for
|
|
673
|
+
// those compiler versions since it is anyway not used for Adreno.
|
|
674
|
+
if (backend_ctx->gpu_family != ADRENO ||
|
|
675
|
+
backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) ||
|
|
676
|
+
backend_ctx->adreno_cl_compiler_version.type == DX) {
|
|
677
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
678
|
+
const std::string kernel_src {
|
|
679
|
+
#include "mul_mv_q4_0_f32_1d_16x_flat.cl.h"
|
|
680
|
+
};
|
|
681
|
+
#else
|
|
682
|
+
const std::string kernel_src = read_file("mul_mv_q4_0_f32_1d_16x_flat.cl");
|
|
683
|
+
#endif
|
|
684
|
+
backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat =
|
|
685
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
686
|
+
|
|
687
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_mul_mv_q4_0_f32_1d_16x_flat, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
|
|
688
|
+
GGML_LOG_CONT(".");
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
// mul_mv_q6_k
|
|
692
|
+
{
|
|
693
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
694
|
+
const std::string kernel_src {
|
|
695
|
+
#include "mul_mv_q6_k.cl.h"
|
|
696
|
+
};
|
|
697
|
+
#else
|
|
698
|
+
const std::string kernel_src = read_file("mul_mv_q6_k.cl");
|
|
699
|
+
#endif
|
|
700
|
+
backend_ctx->program_mul_mv_q6_K =
|
|
701
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
702
|
+
|
|
703
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_mul_mv_q6_K, "kernel_mul_mv_q6_K_f32", &err), err));
|
|
704
|
+
GGML_LOG_CONT(".");
|
|
705
|
+
}
|
|
706
|
+
|
|
707
|
+
// mul_mv_f16_f16
|
|
708
|
+
{
|
|
709
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
710
|
+
const std::string kernel_src {
|
|
711
|
+
#include "mul_mv_f16_f16.cl.h"
|
|
712
|
+
};
|
|
713
|
+
#else
|
|
714
|
+
const std::string kernel_src = read_file("mul_mv_f16_f16.cl");
|
|
715
|
+
#endif
|
|
716
|
+
backend_ctx->program_mul_mv_f16_f16 =
|
|
717
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
718
|
+
|
|
719
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program_mul_mv_f16_f16, "kernel_mul_mat_f16_f16", &err), err));
|
|
720
|
+
GGML_LOG_CONT(".");
|
|
721
|
+
}
|
|
722
|
+
|
|
723
|
+
// mul_mv_f16_f32_1row
|
|
724
|
+
{
|
|
725
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
726
|
+
const std::string kernel_src {
|
|
727
|
+
#include "mul_mv_f16_f32_1row.cl.h"
|
|
728
|
+
};
|
|
729
|
+
#else
|
|
730
|
+
const std::string kernel_src = read_file("mul_mv_f16_f32_1row.cl");
|
|
731
|
+
#endif
|
|
732
|
+
backend_ctx->program_mul_mv_f16_f32_1row =
|
|
733
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
734
|
+
|
|
735
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_1row, "kernel_mul_mat_f16_f32_1row", &err), err));
|
|
736
|
+
GGML_LOG_CONT(".");
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
// mul_mv_f16_f32_l4
|
|
740
|
+
{
|
|
741
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
742
|
+
const std::string kernel_src {
|
|
743
|
+
#include "mul_mv_f16_f32_l4.cl.h"
|
|
744
|
+
};
|
|
745
|
+
#else
|
|
746
|
+
const std::string kernel_src = read_file("mul_mv_f16_f32_l4.cl");
|
|
747
|
+
#endif
|
|
748
|
+
backend_ctx->program_mul_mv_f16_f32_l4 =
|
|
749
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
750
|
+
|
|
751
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
|
|
752
|
+
GGML_LOG_CONT(".");
|
|
753
|
+
}
|
|
754
|
+
|
|
755
|
+
// mul_mv_f16_f32
|
|
756
|
+
{
|
|
757
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
758
|
+
const std::string kernel_src {
|
|
759
|
+
#include "mul_mv_f16_f32.cl.h"
|
|
760
|
+
};
|
|
761
|
+
#else
|
|
762
|
+
const std::string kernel_src = read_file("mul_mv_f16_f32.cl");
|
|
763
|
+
#endif
|
|
764
|
+
backend_ctx->program_mul_mv_f16_f32 =
|
|
765
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
766
|
+
|
|
767
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32, "kernel_mul_mat_f16_f32", &err), err));
|
|
768
|
+
GGML_LOG_CONT(".");
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
// mul_mv_f32_f32
|
|
772
|
+
{
|
|
773
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
774
|
+
const std::string kernel_src {
|
|
775
|
+
#include "mul_mv_f32_f32.cl.h"
|
|
776
|
+
};
|
|
777
|
+
#else
|
|
778
|
+
const std::string kernel_src = read_file("mul_mv_f32_f32.cl");
|
|
779
|
+
#endif
|
|
780
|
+
backend_ctx->program_mul_mv_f32_f32 =
|
|
781
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
782
|
+
|
|
783
|
+
CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program_mul_mv_f32_f32, "kernel_mul_mat_f32_f32", &err), err));
|
|
784
|
+
GGML_LOG_CONT(".");
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
// mul
|
|
788
|
+
{
|
|
789
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
790
|
+
const std::string kernel_src {
|
|
791
|
+
#include "mul.cl.h"
|
|
792
|
+
};
|
|
793
|
+
#else
|
|
794
|
+
const std::string kernel_src = read_file("mul.cl");
|
|
795
|
+
#endif
|
|
796
|
+
backend_ctx->program_mul =
|
|
797
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
798
|
+
|
|
799
|
+
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program_mul, "kernel_mul", &err), err));
|
|
800
|
+
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program_mul, "kernel_mul_row", &err), err));
|
|
801
|
+
GGML_LOG_CONT(".");
|
|
802
|
+
}
|
|
803
|
+
|
|
804
|
+
// norm
|
|
805
|
+
{
|
|
806
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
807
|
+
const std::string kernel_src {
|
|
808
|
+
#include "norm.cl.h"
|
|
809
|
+
};
|
|
810
|
+
#else
|
|
811
|
+
const std::string kernel_src = read_file("norm.cl");
|
|
812
|
+
#endif
|
|
813
|
+
backend_ctx->program_norm =
|
|
814
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
815
|
+
|
|
816
|
+
CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program_norm, "kernel_norm", &err), err));
|
|
817
|
+
GGML_LOG_CONT(".");
|
|
818
|
+
}
|
|
819
|
+
|
|
820
|
+
// relu
|
|
821
|
+
{
|
|
822
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
823
|
+
const std::string kernel_src {
|
|
824
|
+
#include "relu.cl.h"
|
|
825
|
+
};
|
|
826
|
+
#else
|
|
827
|
+
const std::string kernel_src = read_file("relu.cl");
|
|
828
|
+
#endif
|
|
829
|
+
backend_ctx->program_relu =
|
|
830
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
831
|
+
|
|
832
|
+
CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program_relu, "kernel_relu", &err), err));
|
|
833
|
+
GGML_LOG_CONT(".");
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
// rms_norm
|
|
837
|
+
{
|
|
838
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
839
|
+
const std::string kernel_src {
|
|
840
|
+
#include "rms_norm.cl.h"
|
|
841
|
+
};
|
|
842
|
+
#else
|
|
843
|
+
const std::string kernel_src = read_file("rms_norm.cl");
|
|
844
|
+
#endif
|
|
845
|
+
backend_ctx->program_rms_norm =
|
|
846
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
847
|
+
|
|
848
|
+
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program_rms_norm, "kernel_rms_norm", &err), err));
|
|
849
|
+
GGML_LOG_CONT(".");
|
|
850
|
+
}
|
|
851
|
+
|
|
852
|
+
// rope
|
|
853
|
+
{
|
|
854
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
855
|
+
const std::string kernel_src {
|
|
856
|
+
#include "rope.cl.h"
|
|
857
|
+
};
|
|
858
|
+
#else
|
|
859
|
+
const std::string kernel_src = read_file("rope.cl");
|
|
860
|
+
#endif
|
|
861
|
+
backend_ctx->program_rope =
|
|
862
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
863
|
+
|
|
864
|
+
CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f32", &err), err));
|
|
865
|
+
CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_norm_f16", &err), err));
|
|
866
|
+
CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f32", &err), err));
|
|
867
|
+
CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_neox_f16", &err), err));
|
|
868
|
+
CL_CHECK((backend_ctx->kernel_rope_multi_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f32", &err), err));
|
|
869
|
+
CL_CHECK((backend_ctx->kernel_rope_multi_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_multi_f16", &err), err));
|
|
870
|
+
CL_CHECK((backend_ctx->kernel_rope_vision_f32 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f32", &err), err));
|
|
871
|
+
CL_CHECK((backend_ctx->kernel_rope_vision_f16 = clCreateKernel(backend_ctx->program_rope, "kernel_rope_vision_f16", &err), err));
|
|
872
|
+
GGML_LOG_CONT(".");
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
// scale
|
|
876
|
+
{
|
|
877
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
878
|
+
const std::string kernel_src {
|
|
879
|
+
#include "scale.cl.h"
|
|
880
|
+
};
|
|
881
|
+
#else
|
|
882
|
+
const std::string kernel_src = read_file("scale.cl");
|
|
883
|
+
#endif
|
|
884
|
+
backend_ctx->program_scale =
|
|
885
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
886
|
+
|
|
887
|
+
CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program_scale, "kernel_scale", &err), err));
|
|
888
|
+
GGML_LOG_CONT(".");
|
|
889
|
+
}
|
|
890
|
+
|
|
891
|
+
// silu
|
|
892
|
+
{
|
|
893
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
894
|
+
const std::string kernel_src {
|
|
895
|
+
#include "silu.cl.h"
|
|
896
|
+
};
|
|
897
|
+
#else
|
|
898
|
+
const std::string kernel_src = read_file("silu.cl");
|
|
899
|
+
#endif
|
|
900
|
+
backend_ctx->program_silu =
|
|
901
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
902
|
+
|
|
903
|
+
CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program_silu, "kernel_silu", &err), err));
|
|
904
|
+
CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program_silu, "kernel_silu_4", &err), err));
|
|
905
|
+
GGML_LOG_CONT(".");
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
// softmax_f32
|
|
909
|
+
{
|
|
910
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
911
|
+
const std::string kernel_src {
|
|
912
|
+
#include "softmax_f32.cl.h"
|
|
913
|
+
};
|
|
914
|
+
#else
|
|
915
|
+
const std::string kernel_src = read_file("softmax_f32.cl");
|
|
916
|
+
#endif
|
|
917
|
+
backend_ctx->program_softmax_f32 =
|
|
918
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
919
|
+
|
|
920
|
+
CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program_softmax_f32, "kernel_soft_max", &err), err));
|
|
921
|
+
GGML_LOG_CONT(".");
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
// softmax_f16
|
|
925
|
+
{
|
|
926
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
927
|
+
const std::string kernel_src {
|
|
928
|
+
#include "softmax_f16.cl.h"
|
|
929
|
+
};
|
|
930
|
+
#else
|
|
931
|
+
const std::string kernel_src = read_file("softmax_f16.cl");
|
|
932
|
+
#endif
|
|
933
|
+
backend_ctx->program_softmax_f16 =
|
|
934
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
935
|
+
|
|
936
|
+
CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program_softmax_f16, "kernel_soft_max_f16", &err), err));
|
|
937
|
+
GGML_LOG_CONT(".");
|
|
938
|
+
}
|
|
939
|
+
|
|
940
|
+
// softmax_4_f32
|
|
941
|
+
{
|
|
942
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
943
|
+
const std::string kernel_src {
|
|
944
|
+
#include "softmax_4_f32.cl.h"
|
|
945
|
+
};
|
|
946
|
+
#else
|
|
947
|
+
const std::string kernel_src = read_file("softmax_4_f32.cl");
|
|
948
|
+
#endif
|
|
949
|
+
backend_ctx->program_softmax_4_f32 =
|
|
950
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
951
|
+
|
|
952
|
+
CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program_softmax_4_f32, "kernel_soft_max_4", &err), err));
|
|
953
|
+
GGML_LOG_CONT(".");
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
// softmax_4_f16
|
|
957
|
+
{
|
|
958
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
959
|
+
const std::string kernel_src {
|
|
960
|
+
#include "softmax_4_f16.cl.h"
|
|
961
|
+
};
|
|
962
|
+
#else
|
|
963
|
+
const std::string kernel_src = read_file("softmax_4_f16.cl");
|
|
964
|
+
#endif
|
|
965
|
+
backend_ctx->program_softmax_4_f16 =
|
|
966
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
967
|
+
|
|
968
|
+
CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program_softmax_4_f16, "kernel_soft_max_4_f16", &err), err));
|
|
969
|
+
GGML_LOG_CONT(".");
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
// Adreno kernels
|
|
973
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
974
|
+
// transpose
|
|
975
|
+
{
|
|
976
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
977
|
+
const std::string kernel_src {
|
|
978
|
+
#include "transpose.cl.h"
|
|
979
|
+
};
|
|
980
|
+
#else
|
|
981
|
+
const std::string kernel_src = read_file("transpose.cl");
|
|
982
|
+
#endif
|
|
983
|
+
backend_ctx->program_transpose =
|
|
984
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
985
|
+
|
|
986
|
+
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32_16", &err), err));
|
|
987
|
+
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_32", &err), err));
|
|
988
|
+
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose, "kernel_transpose_16", &err), err));
|
|
989
|
+
GGML_LOG_CONT(".");
|
|
990
|
+
}
|
|
991
|
+
|
|
992
|
+
// gemv_noshuffle_general
|
|
993
|
+
{
|
|
994
|
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
995
|
+
" -cl-mad-enable "
|
|
996
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
997
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
998
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
|
999
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1000
|
+
}
|
|
1001
|
+
|
|
1002
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1003
|
+
const std::string kernel_src_CL_gemv_general {
|
|
1004
|
+
#include "gemv_noshuffle_general.cl.h"
|
|
1005
|
+
};
|
|
1006
|
+
#else
|
|
1007
|
+
const std::string kernel_src_CL_gemv_general = read_file("gemv_noshuffle_general.cl");
|
|
1008
|
+
#endif
|
|
1009
|
+
|
|
1010
|
+
backend_ctx->program_CL_gemv_general = build_program_from_source(
|
|
1011
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
|
|
1012
|
+
|
|
1013
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
|
1014
|
+
GGML_LOG_CONT(".");
|
|
1015
|
+
}
|
|
1016
|
+
|
|
1017
|
+
// gemv_noshuffle
|
|
1018
|
+
{
|
|
1019
|
+
// Gemv 2048, 16384
|
|
1020
|
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
1021
|
+
" -cl-mad-enable "
|
|
1022
|
+
" -DLINE_STRIDE_A=2048 "
|
|
1023
|
+
" -DBLOCK_STRIDE_A=16384 "
|
|
1024
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
1025
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
1026
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
|
1027
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1031
|
+
const std::string kernel_src_CL_gemv {
|
|
1032
|
+
#include "gemv_noshuffle.cl.h"
|
|
1033
|
+
};
|
|
1034
|
+
#else
|
|
1035
|
+
const std::string kernel_src_CL_gemv = read_file("gemv_noshuffle.cl");
|
|
1036
|
+
#endif
|
|
1037
|
+
|
|
1038
|
+
backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
|
|
1039
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
1040
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
1041
|
+
GGML_LOG_CONT(".");
|
|
1042
|
+
|
|
1043
|
+
// Gemv 2048, 16384
|
|
1044
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
1045
|
+
" -cl-mad-enable "
|
|
1046
|
+
" -DLINE_STRIDE_A=2048 "
|
|
1047
|
+
" -DBLOCK_STRIDE_A=16384 "
|
|
1048
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
1049
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
1050
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
|
1051
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source(
|
|
1055
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
1056
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
|
1057
|
+
GGML_LOG_CONT(".");
|
|
1058
|
+
|
|
1059
|
+
// Gemv 5504, 44032
|
|
1060
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
1061
|
+
" -cl-mad-enable "
|
|
1062
|
+
" -DLINE_STRIDE_A=5504 "
|
|
1063
|
+
" -DBLOCK_STRIDE_A=44032 "
|
|
1064
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
1065
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
1066
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
|
1067
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1068
|
+
}
|
|
1069
|
+
|
|
1070
|
+
backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
|
|
1071
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
1072
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
1073
|
+
GGML_LOG_CONT(".");
|
|
1074
|
+
|
|
1075
|
+
// Gemv 16000, 128000
|
|
1076
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
1077
|
+
" -cl-mad-enable "
|
|
1078
|
+
" -DLINE_STRIDE_A=16000 "
|
|
1079
|
+
" -DBLOCK_STRIDE_A=128000 "
|
|
1080
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
1081
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
1082
|
+
|
|
1083
|
+
if (backend_ctx->has_vector_subgroup_broadcast) {
|
|
1084
|
+
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(
|
|
1088
|
+
backend_ctx->context, backend_ctx->device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
1089
|
+
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
1090
|
+
GGML_LOG_CONT(".");
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
// mul_mat_Ab_Bi_8x4
|
|
1094
|
+
{
|
|
1095
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1096
|
+
const std::string kernel_src_CL_gemm {
|
|
1097
|
+
#include "mul_mat_Ab_Bi_8x4.cl.h"
|
|
1098
|
+
};
|
|
1099
|
+
#else
|
|
1100
|
+
const std::string kernel_src_CL_gemm = read_file("mul_mat_Ab_Bi_8x4.cl");
|
|
1101
|
+
#endif
|
|
1102
|
+
backend_ctx->program_CL_gemm = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src_CL_gemm.c_str(), compile_opts);
|
|
1103
|
+
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
|
|
1104
|
+
GGML_LOG_CONT(".");
|
|
1105
|
+
}
|
|
1106
|
+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
1107
|
+
GGML_LOG_CONT("\n");
|
|
1108
|
+
}
|
|
1109
|
+
|
|
373
1110
|
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
374
1111
|
static bool initialized = false;
|
|
375
1112
|
static ggml_backend_opencl_context *backend_ctx = nullptr;
|
|
@@ -411,6 +1148,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
411
1148
|
unsigned number;
|
|
412
1149
|
cl_device_type type;
|
|
413
1150
|
char name[128];
|
|
1151
|
+
char version[128];
|
|
414
1152
|
};
|
|
415
1153
|
|
|
416
1154
|
enum { NPLAT = 16, NDEV = 16 };
|
|
@@ -451,6 +1189,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
451
1189
|
d->platform = p;
|
|
452
1190
|
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL));
|
|
453
1191
|
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL));
|
|
1192
|
+
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_VERSION, sizeof(d->version), &d->version, NULL));
|
|
454
1193
|
|
|
455
1194
|
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) {
|
|
456
1195
|
p->default_device = d;
|
|
@@ -543,7 +1282,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
543
1282
|
}
|
|
544
1283
|
|
|
545
1284
|
GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
|
|
546
|
-
GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name);
|
|
1285
|
+
GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
|
|
547
1286
|
if (default_device->type != CL_DEVICE_TYPE_GPU) {
|
|
548
1287
|
GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
|
|
549
1288
|
}
|
|
@@ -552,9 +1291,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
552
1291
|
dev_ctx->device = default_device->id;
|
|
553
1292
|
backend_ctx->device = default_device->id;
|
|
554
1293
|
|
|
555
|
-
if (strstr(default_device->name, "Adreno")
|
|
1294
|
+
if (strstr(default_device->name, "Adreno") ||
|
|
1295
|
+
strstr(default_device->name, "Qualcomm") ||
|
|
1296
|
+
strstr(default_device->version, "Adreno")) {
|
|
556
1297
|
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
|
557
|
-
|
|
1298
|
+
// Usually device version contains the detailed device name
|
|
1299
|
+
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
|
|
1300
|
+
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
|
|
1301
|
+
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
|
|
1302
|
+
}
|
|
558
1303
|
|
|
559
1304
|
// Use wave size of 64 for all Adreno GPUs.
|
|
560
1305
|
backend_ctx->adreno_wave_size = 64;
|
|
@@ -600,11 +1345,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
600
1345
|
GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version);
|
|
601
1346
|
backend_ctx->driver_version = driver_version;
|
|
602
1347
|
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
adreno_cl_compiler_version >= 47 ||
|
|
1348
|
+
backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
|
|
1349
|
+
backend_ctx->has_vector_subgroup_broadcast =
|
|
1350
|
+
backend_ctx->adreno_cl_compiler_version.major >= 47 ||
|
|
1351
|
+
backend_ctx->adreno_cl_compiler_version.major == 17;
|
|
606
1352
|
GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
|
|
607
|
-
has_vector_subgroup_broadcast ? "true" : "false");
|
|
1353
|
+
backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");
|
|
608
1354
|
|
|
609
1355
|
size_t ext_str_size;
|
|
610
1356
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size);
|
|
@@ -679,230 +1425,32 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
679
1425
|
#endif
|
|
680
1426
|
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
|
|
681
1427
|
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
#include "ggml-opencl.cl.h"
|
|
685
|
-
};
|
|
686
|
-
#else
|
|
687
|
-
const std::string kernel_src = read_file("ggml-opencl.cl");
|
|
688
|
-
#endif
|
|
689
|
-
|
|
690
|
-
auto opencl_c_std =
|
|
691
|
-
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
|
692
|
-
|
|
693
|
-
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
694
|
-
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
|
695
|
-
" -cl-finite-math-only -cl-fast-relaxed-math";
|
|
696
|
-
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
|
|
697
|
-
|
|
698
|
-
// Non matmul kernels.
|
|
699
|
-
CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err));
|
|
700
|
-
CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err));
|
|
701
|
-
CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err));
|
|
702
|
-
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err));
|
|
703
|
-
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err));
|
|
704
|
-
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err));
|
|
705
|
-
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err));
|
|
706
|
-
CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err));
|
|
707
|
-
CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err));
|
|
708
|
-
CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err));
|
|
709
|
-
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err));
|
|
710
|
-
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err));
|
|
711
|
-
CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err));
|
|
712
|
-
CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err));
|
|
713
|
-
CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err));
|
|
714
|
-
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err));
|
|
715
|
-
CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err));
|
|
716
|
-
CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
|
|
717
|
-
CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
|
|
718
|
-
CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
|
|
719
|
-
CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
|
|
720
|
-
CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
|
|
721
|
-
CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
|
|
722
|
-
CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
|
|
723
|
-
CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
|
|
724
|
-
CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err));
|
|
725
|
-
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err));
|
|
726
|
-
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err));
|
|
727
|
-
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err));
|
|
728
|
-
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err));
|
|
729
|
-
|
|
730
|
-
// Matmul kernels.
|
|
731
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err));
|
|
732
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err));
|
|
733
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err));
|
|
734
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err));
|
|
735
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err));
|
|
736
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err));
|
|
737
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err));
|
|
738
|
-
|
|
739
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err));
|
|
740
|
-
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err));
|
|
741
|
-
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err));
|
|
742
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err));
|
|
743
|
-
|
|
744
|
-
// Load additional mulmat kernels.
|
|
745
|
-
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
746
|
-
const std::string kernel_src_1 {
|
|
747
|
-
#include "ggml-opencl_mm.cl.h"
|
|
748
|
-
};
|
|
749
|
-
#else
|
|
750
|
-
const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl");
|
|
751
|
-
#endif
|
|
752
|
-
backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts);
|
|
753
|
-
|
|
754
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err));
|
|
755
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err));
|
|
756
|
-
CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err));
|
|
757
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err));
|
|
758
|
-
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err));
|
|
1428
|
+
// Load kernels
|
|
1429
|
+
load_cl_kernels(backend_ctx, opencl_c_version);
|
|
759
1430
|
|
|
760
|
-
// Load additional data conversion kernels.
|
|
761
|
-
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
762
|
-
const std::string kernel_src_2 {
|
|
763
|
-
#include "ggml-opencl_cvt.cl.h"
|
|
764
|
-
};
|
|
765
|
-
#else
|
|
766
|
-
const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl");
|
|
767
|
-
#endif
|
|
768
|
-
backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts);
|
|
769
|
-
|
|
770
|
-
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
|
771
|
-
|
|
772
|
-
// Kernels for Adreno
|
|
773
1431
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
#include "ggml-opencl_transpose_32_16.cl.h"
|
|
787
|
-
};
|
|
788
|
-
#else
|
|
789
|
-
const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl");
|
|
790
|
-
#endif
|
|
791
|
-
backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts);
|
|
792
|
-
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err));
|
|
793
|
-
|
|
794
|
-
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
795
|
-
const std::string transpose_16_src {
|
|
796
|
-
#include "ggml-opencl_transpose_16.cl.h"
|
|
797
|
-
};
|
|
798
|
-
#else
|
|
799
|
-
const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl");
|
|
800
|
-
#endif
|
|
801
|
-
backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts);
|
|
802
|
-
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
|
|
803
|
-
|
|
804
|
-
// Gemv general
|
|
805
|
-
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
806
|
-
" -cl-mad-enable "
|
|
807
|
-
" -DSIMDGROUP_WIDTH=" +
|
|
808
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
|
809
|
-
if (has_vector_subgroup_broadcast) {
|
|
810
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
811
|
-
}
|
|
812
|
-
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
813
|
-
const std::string kernel_src_CL_gemv_general {
|
|
814
|
-
#include "ggml-opencl_gemv_noshuffle_general.cl.h"
|
|
815
|
-
};
|
|
816
|
-
#else
|
|
817
|
-
const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl");
|
|
818
|
-
#endif
|
|
819
|
-
|
|
820
|
-
backend_ctx->program_CL_gemv_general = build_program_from_source(
|
|
821
|
-
context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts);
|
|
822
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
|
823
|
-
|
|
824
|
-
// Gemv 2048, 16384
|
|
825
|
-
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
826
|
-
" -cl-mad-enable "
|
|
827
|
-
" -DLINE_STRIDE_A=2048 "
|
|
828
|
-
" -DBLOCK_STRIDE_A=16384 "
|
|
829
|
-
" -DSIMDGROUP_WIDTH=" +
|
|
830
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
|
831
|
-
if (has_vector_subgroup_broadcast) {
|
|
832
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1432
|
+
// Allocate intermediate buffers and images
|
|
1433
|
+
size_t required_A_q_d_bytes = 311164928;
|
|
1434
|
+
size_t required_A_s_d_bytes = 38895616;
|
|
1435
|
+
size_t required_B_d_bytes = 45088768;
|
|
1436
|
+
|
|
1437
|
+
// Ensure buffer sizes do not exceed the maximum allocation size
|
|
1438
|
+
size_t max_A_q_d_bytes = MIN(required_A_q_d_bytes, backend_ctx->max_alloc_size);
|
|
1439
|
+
size_t max_A_s_d_bytes = MIN(required_A_s_d_bytes, backend_ctx->max_alloc_size);
|
|
1440
|
+
size_t max_B_d_bytes = MIN(required_B_d_bytes, backend_ctx->max_alloc_size);
|
|
1441
|
+
if (required_A_q_d_bytes > backend_ctx->max_alloc_size) {
|
|
1442
|
+
GGML_LOG_WARN("ggml_opencl: A_q_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
|
1443
|
+
required_A_q_d_bytes, max_A_q_d_bytes);
|
|
833
1444
|
}
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
};
|
|
838
|
-
#else
|
|
839
|
-
const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl");
|
|
840
|
-
#endif
|
|
841
|
-
|
|
842
|
-
backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source(
|
|
843
|
-
context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
844
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
845
|
-
|
|
846
|
-
// Gemv 2048, 16384
|
|
847
|
-
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
848
|
-
" -cl-mad-enable "
|
|
849
|
-
" -DLINE_STRIDE_A=2048 "
|
|
850
|
-
" -DBLOCK_STRIDE_A=16384 "
|
|
851
|
-
" -DSIMDGROUP_WIDTH=" +
|
|
852
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
|
853
|
-
if (has_vector_subgroup_broadcast) {
|
|
854
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1445
|
+
if (required_A_s_d_bytes > backend_ctx->max_alloc_size) {
|
|
1446
|
+
GGML_LOG_WARN("ggml_opencl: A_s_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
|
1447
|
+
required_A_s_d_bytes, max_A_s_d_bytes);
|
|
855
1448
|
}
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
|
860
|
-
|
|
861
|
-
// Gemv 5504, 44032
|
|
862
|
-
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
863
|
-
" -cl-mad-enable "
|
|
864
|
-
" -DLINE_STRIDE_A=5504 "
|
|
865
|
-
" -DBLOCK_STRIDE_A=44032 "
|
|
866
|
-
" -DSIMDGROUP_WIDTH=" +
|
|
867
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
|
868
|
-
if (has_vector_subgroup_broadcast) {
|
|
869
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
1449
|
+
if (required_B_d_bytes > backend_ctx->max_alloc_size) {
|
|
1450
|
+
GGML_LOG_WARN("ggml_opencl: B_d buffer size reduced from %zu to %zu due to device limitations.\n",
|
|
1451
|
+
required_B_d_bytes, max_B_d_bytes);
|
|
870
1452
|
}
|
|
871
1453
|
|
|
872
|
-
backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source(
|
|
873
|
-
context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
874
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
875
|
-
|
|
876
|
-
// Gemv 16000, 128000
|
|
877
|
-
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
878
|
-
" -cl-mad-enable "
|
|
879
|
-
" -DLINE_STRIDE_A=16000 "
|
|
880
|
-
" -DBLOCK_STRIDE_A=128000 "
|
|
881
|
-
" -DSIMDGROUP_WIDTH=" +
|
|
882
|
-
std::to_string(backend_ctx->adreno_wave_size);
|
|
883
|
-
if (has_vector_subgroup_broadcast) {
|
|
884
|
-
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
885
|
-
}
|
|
886
|
-
|
|
887
|
-
backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts);
|
|
888
|
-
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
889
|
-
|
|
890
|
-
// Gemm
|
|
891
|
-
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
892
|
-
const std::string kernel_src_CL_gemm {
|
|
893
|
-
#include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h"
|
|
894
|
-
};
|
|
895
|
-
#else
|
|
896
|
-
const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl");
|
|
897
|
-
#endif
|
|
898
|
-
backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts);
|
|
899
|
-
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
|
|
900
|
-
|
|
901
|
-
// Allocate intermediate buffers and images
|
|
902
|
-
size_t max_A_q_d_bytes = 311164928;
|
|
903
|
-
size_t max_A_s_d_bytes = 38895616;
|
|
904
|
-
size_t max_B_d_bytes = 45088768;
|
|
905
|
-
|
|
906
1454
|
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err));
|
|
907
1455
|
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err));
|
|
908
1456
|
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
|
|
@@ -973,7 +1521,7 @@ static void ggml_cl2_free(void) {
|
|
|
973
1521
|
info.cmd_complete_duration_ns/1.e6f,
|
|
974
1522
|
info.cmd_total_duration_ns/1.e6f,
|
|
975
1523
|
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
976
|
-
info.local_size[0], info.local_size[
|
|
1524
|
+
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
977
1525
|
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
978
1526
|
}
|
|
979
1527
|
fclose(fperf);
|
|
@@ -1187,6 +1735,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
1187
1735
|
case GGML_UNARY_OP_GELU:
|
|
1188
1736
|
case GGML_UNARY_OP_SILU:
|
|
1189
1737
|
case GGML_UNARY_OP_RELU:
|
|
1738
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
1190
1739
|
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
1191
1740
|
default:
|
|
1192
1741
|
return false;
|
|
@@ -1216,14 +1765,26 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
1216
1765
|
return op->ne[3] == 1;
|
|
1217
1766
|
case GGML_OP_ROPE: {
|
|
1218
1767
|
const int mode = ((const int32_t *) op->op_params)[2];
|
|
1219
|
-
|
|
1768
|
+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
|
1769
|
+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
|
1770
|
+
if (is_mrope && !is_vision) {
|
|
1771
|
+
if (op->src[0]->type == GGML_TYPE_F32 ||
|
|
1772
|
+
op->src[0]->type == GGML_TYPE_F16) {
|
|
1773
|
+
return true;
|
|
1774
|
+
}
|
|
1220
1775
|
return false;
|
|
1221
1776
|
}
|
|
1222
|
-
if (
|
|
1777
|
+
if (is_vision) {
|
|
1778
|
+
if (op->src[0]->type == GGML_TYPE_F32 ||
|
|
1779
|
+
op->src[0]->type == GGML_TYPE_F16) {
|
|
1780
|
+
return true;
|
|
1781
|
+
}
|
|
1223
1782
|
return false;
|
|
1224
1783
|
}
|
|
1225
1784
|
return true;
|
|
1226
1785
|
}
|
|
1786
|
+
case GGML_OP_IM2COL:
|
|
1787
|
+
return true;
|
|
1227
1788
|
default:
|
|
1228
1789
|
return false;
|
|
1229
1790
|
}
|
|
@@ -1431,8 +1992,15 @@ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buff
|
|
|
1431
1992
|
|
|
1432
1993
|
// The optimized gemm and gemv kernels are used for large matrices without batch.
|
|
1433
1994
|
// tensor is the quantized weights matrix.
|
|
1434
|
-
inline bool use_adreno_kernels(const ggml_tensor *tensor) {
|
|
1435
|
-
|
|
1995
|
+
inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
|
|
1996
|
+
int64_t threshold_ne0 = 512;
|
|
1997
|
+
int64_t threshold_ne1 = 512;
|
|
1998
|
+
if (!backend_ctx->adreno_cl_compiler_version.newer_than_or_same(E031, 38, 11, 0) &&
|
|
1999
|
+
backend_ctx->adreno_cl_compiler_version.type != DX) {
|
|
2000
|
+
threshold_ne0 = 128;
|
|
2001
|
+
threshold_ne1 = 128;
|
|
2002
|
+
}
|
|
2003
|
+
return tensor->ne[0] >= threshold_ne0 && tensor->ne[1] >= threshold_ne1 &&
|
|
1436
2004
|
tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
|
1437
2005
|
}
|
|
1438
2006
|
|
|
@@ -1510,7 +2078,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1510
2078
|
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0;
|
|
1511
2079
|
|
|
1512
2080
|
// The optimized kernels need weights in natural order, so unshuffle.
|
|
1513
|
-
if (use_adreno_kernels(tensor)) {
|
|
2081
|
+
if (use_adreno_kernels(backend_ctx, tensor)) {
|
|
1514
2082
|
kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle;
|
|
1515
2083
|
}
|
|
1516
2084
|
#else
|
|
@@ -1534,7 +2102,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1534
2102
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
1535
2103
|
// Only do transpose for large, non batched matrix
|
|
1536
2104
|
// TODO: use preallocated images instead of sub-buffer then image
|
|
1537
|
-
if (use_adreno_kernels(tensor)) {
|
|
2105
|
+
if (use_adreno_kernels(backend_ctx, tensor)) {
|
|
1538
2106
|
// <----------------------------------------------------------------------------------> //
|
|
1539
2107
|
// start transpose
|
|
1540
2108
|
// <----------------------------------------------------------------------------------> //
|
|
@@ -2582,6 +3150,53 @@ static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
2582
3150
|
#endif
|
|
2583
3151
|
}
|
|
2584
3152
|
|
|
3153
|
+
// Enqueues the "gelu_quick" unary kernel: reads src0 and writes dst on the device.
//
// backend - backend whose context provides the command queue and the kernels
// src0    - input tensor; must have device-side extra data attached
// src1    - unused; present only so the function matches the common op signature
// dst     - output tensor; must have device-side extra data attached
//
// When the element count is a multiple of 4 the `kernel_gelu_quick_4` variant is
// selected and the work size is divided by 4 (presumably it handles 4 elements per
// work-item - the kernel source is not visible here).
static void ggml_cl_gelu_quick(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src0->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    UNUSED(src1);

    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
    cl_command_queue queue = backend_ctx->queue;

    ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong offset0 = extra0->offset + src0->view_offs;
    cl_ulong offsetd = extrad->offset + dst->view_offs;

    cl_kernel kernel;

    // Keep the full 64-bit element count; narrowing to int could truncate.
    int64_t n = ggml_nelements(dst);

    // Nothing to compute; a zero-sized NDRange is rejected by some OpenCL versions.
    if (n == 0) {
        return;
    }

    if (n % 4 == 0) {
        kernel = backend_ctx->kernel_gelu_quick_4;
        n /= 4;
    } else {
        kernel = backend_ctx->kernel_gelu_quick;
    }

    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem),   &extra0->data_device));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem),   &extrad->data_device));
    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));

    size_t global_work_size[] = {(size_t)n, 1, 1};
    size_t local_work_size[] = {64, 1, 1};

    // An explicit local size requires the global size to be a multiple of it unless
    // non-uniform work-groups are available; otherwise the enqueue fails with
    // CL_INVALID_WORK_GROUP_SIZE. In that case let the runtime pick the local size.
    size_t * local_work_size_ptr =
        (global_work_size[0] % local_work_size[0] == 0) ? local_work_size : NULL;

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    // The profiling record keeps the nominal local size array (never a null pointer).
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
#endif
}
|
|
3199
|
+
|
|
2585
3200
|
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
2586
3201
|
GGML_ASSERT(src0);
|
|
2587
3202
|
GGML_ASSERT(src0->extra);
|
|
@@ -2788,8 +3403,8 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
2788
3403
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
2789
3404
|
cl_command_queue queue = backend_ctx->queue;
|
|
2790
3405
|
|
|
2791
|
-
ggml_backend_opencl_device_context * dev_ctx =
|
|
2792
|
-
|
|
3406
|
+
//ggml_backend_opencl_device_context * dev_ctx =
|
|
3407
|
+
// (ggml_backend_opencl_device_context *)backend->device->context;
|
|
2793
3408
|
|
|
2794
3409
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
2795
3410
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
@@ -2820,13 +3435,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
2820
3435
|
|
|
2821
3436
|
// Note, this kernel declares local memory in kernel args and the size
|
|
2822
3437
|
// depends on subgroup size.
|
|
2823
|
-
// Retrieve subgroup size.
|
|
2824
3438
|
// Note, this requires OpenCL 2.1 and above
|
|
3439
|
+
// For now we use fixed subgroup size to simplify support for OpenCL 2.0.
|
|
2825
3440
|
size_t sgs;
|
|
2826
|
-
CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
|
|
2827
|
-
|
|
2828
|
-
|
|
2829
|
-
|
|
3441
|
+
//CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device,
|
|
3442
|
+
// CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
|
|
3443
|
+
// sizeof(local_work_size), local_work_size,
|
|
3444
|
+
// sizeof(size_t), &sgs, NULL));
|
|
3445
|
+
if (backend_ctx->gpu_family == ADRENO) {
|
|
3446
|
+
sgs = 64;
|
|
3447
|
+
} else if (backend_ctx->gpu_family == INTEL) {
|
|
3448
|
+
sgs = 32;
|
|
3449
|
+
} else {
|
|
3450
|
+
GGML_ASSERT(false && "Unsupported GPU");
|
|
3451
|
+
}
|
|
2830
3452
|
|
|
2831
3453
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
2832
3454
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
@@ -2919,7 +3541,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
2919
3541
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
2920
3542
|
cl_context context = backend_ctx->context;
|
|
2921
3543
|
|
|
2922
|
-
if (ne01 && ne1 && use_adreno_kernels(src0)) {
|
|
3544
|
+
if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
|
|
2923
3545
|
|
|
2924
3546
|
// init CL objects
|
|
2925
3547
|
// <--------------------------------------------> //
|
|
@@ -3980,6 +4602,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3980
4602
|
float attn_factor;
|
|
3981
4603
|
float beta_fast;
|
|
3982
4604
|
float beta_slow;
|
|
4605
|
+
int32_t sections[4];
|
|
3983
4606
|
|
|
3984
4607
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
3985
4608
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
@@ -3987,29 +4610,62 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3987
4610
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
|
3988
4611
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
3989
4612
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
4613
|
+
memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int32_t)*4);
|
|
3990
4614
|
|
|
3991
4615
|
const bool is_neox = mode & 2;
|
|
4616
|
+
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
|
4617
|
+
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
|
4618
|
+
|
|
4619
|
+
if (is_mrope) {
|
|
4620
|
+
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
|
4621
|
+
}
|
|
4622
|
+
|
|
4623
|
+
if (is_vision) {
|
|
4624
|
+
GGML_ASSERT(n_dims == ne00/2);
|
|
4625
|
+
}
|
|
3992
4626
|
|
|
3993
4627
|
cl_kernel kernel;
|
|
3994
4628
|
|
|
3995
|
-
if (
|
|
4629
|
+
if (is_neox) {
|
|
3996
4630
|
switch (src0->type) {
|
|
3997
4631
|
case GGML_TYPE_F32:
|
|
3998
|
-
kernel = backend_ctx->
|
|
4632
|
+
kernel = backend_ctx->kernel_rope_neox_f32;
|
|
3999
4633
|
break;
|
|
4000
4634
|
case GGML_TYPE_F16:
|
|
4001
|
-
kernel = backend_ctx->
|
|
4635
|
+
kernel = backend_ctx->kernel_rope_neox_f16;
|
|
4636
|
+
break;
|
|
4637
|
+
default:
|
|
4638
|
+
GGML_ASSERT(false);
|
|
4639
|
+
};
|
|
4640
|
+
} else if (is_mrope && !is_vision) {
|
|
4641
|
+
switch (src0->type) {
|
|
4642
|
+
case GGML_TYPE_F32:
|
|
4643
|
+
kernel = backend_ctx->kernel_rope_multi_f32;
|
|
4644
|
+
break;
|
|
4645
|
+
case GGML_TYPE_F16:
|
|
4646
|
+
kernel = backend_ctx->kernel_rope_multi_f16;
|
|
4002
4647
|
break;
|
|
4003
4648
|
default:
|
|
4004
4649
|
GGML_ASSERT(false);
|
|
4005
4650
|
};
|
|
4651
|
+
} else if (is_vision) {
|
|
4652
|
+
switch (src0->type) {
|
|
4653
|
+
case GGML_TYPE_F32:
|
|
4654
|
+
kernel = backend_ctx->kernel_rope_vision_f32;
|
|
4655
|
+
break;
|
|
4656
|
+
case GGML_TYPE_F16:
|
|
4657
|
+
kernel = backend_ctx->kernel_rope_vision_f16;
|
|
4658
|
+
break;
|
|
4659
|
+
default:
|
|
4660
|
+
GGML_ASSERT(false);
|
|
4661
|
+
}
|
|
4006
4662
|
} else {
|
|
4007
4663
|
switch (src0->type) {
|
|
4008
4664
|
case GGML_TYPE_F32:
|
|
4009
|
-
kernel = backend_ctx->
|
|
4665
|
+
kernel = backend_ctx->kernel_rope_norm_f32;
|
|
4010
4666
|
break;
|
|
4011
4667
|
case GGML_TYPE_F16:
|
|
4012
|
-
kernel = backend_ctx->
|
|
4668
|
+
kernel = backend_ctx->kernel_rope_norm_f16;
|
|
4013
4669
|
break;
|
|
4014
4670
|
default:
|
|
4015
4671
|
GGML_ASSERT(false);
|
|
@@ -4049,6 +4705,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4049
4705
|
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
|
|
4050
4706
|
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
|
|
4051
4707
|
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
|
|
4708
|
+
if (is_mrope || is_vision) {
|
|
4709
|
+
CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
|
|
4710
|
+
}
|
|
4052
4711
|
|
|
4053
4712
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
4054
4713
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
@@ -4064,6 +4723,98 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
4064
4723
|
#endif
|
|
4065
4724
|
}
|
|
4066
4725
|
|
|
4726
|
+
// Sets the arguments of the im2col kernel and enqueues it.
//
// backend - backend whose context provides the command queue and the kernels
// src0    - filter tensor; only its shape (ne[0], ne[1]) is read
// src1    - input tensor; must be F32 and have device-side extra data attached
// dst     - output tensor; F16 or F32 (selects the kernel), must have device-side
//           extra data attached; its op_params hold stride/padding/dilation and
//           the 1D/2D flag
static void ggml_cl_im2col(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(src0);
    GGML_ASSERT(src1);
    GGML_ASSERT(src1->extra);
    GGML_ASSERT(dst);
    GGML_ASSERT(dst->extra);

    // src0 is the filter, src1 is the input
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    ggml_backend_opencl_context * ctx = (ggml_backend_opencl_context *)backend->context;

    ggml_tensor_extra_cl * src1_extra = (ggml_tensor_extra_cl *)src1->extra;
    ggml_tensor_extra_cl * dst_extra  = (ggml_tensor_extra_cl *)dst->extra;

    cl_ulong src1_offset = src1_extra->offset + src1->view_offs;
    cl_ulong dst_offset  = dst_extra->offset + dst->view_offs;

    // op_params layout: [s0, s1, p0, p1, d0, d1, is_2D]
    const int32_t * params = (const int32_t *)(dst->op_params);
    const int32_t stride0   = params[0];
    const int32_t stride1   = params[1];
    const int32_t pad0      = params[2];
    const int32_t pad1      = params[3];
    const int32_t dilation0 = params[4];
    const int32_t dilation1 = params[5];
    const bool    is_2D     = params[6] == 1;

    // The channel and batch dimensions shift by one between the 1D and 2D cases.
    const int channel_dim = is_2D ? 2 : 1;
    const int batch_dim   = is_2D ? 3 : 2;

    const cl_long in_w  = src1->ne[0];
    const cl_long in_h  = is_2D ? src1->ne[1] : 1;
    const cl_long in_c  = src1->ne[channel_dim];
    const cl_long n_batch = src1->ne[batch_dim];

    const cl_long k_w = src0->ne[0];
    const cl_long k_h = is_2D ? src0->ne[1] : 1;

    const cl_long out_w = dst->ne[1];
    const cl_long out_h = is_2D ? dst->ne[2] : 1;

    // nb[] holds byte strides; src1 is F32, so divide by 4 to get element strides.
    const cl_ulong channel_stride = src1->nb[channel_dim]/4;
    const cl_ulong batch_stride   = src1->nb[batch_dim]/4;

    const cl_long n_pelements = out_w*k_w*k_h;
    const cl_long chw         = in_c*k_h*k_w;

    cl_kernel kernel = (dst->type == GGML_TYPE_F16) ? ctx->kernel_im2col_f16
                                                    : ctx->kernel_im2col_f32;

    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &src1_extra->data_device));
    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &src1_offset));
    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &dst_extra->data_device));
    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &dst_offset));
    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_ulong), &batch_stride));
    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &channel_stride));
    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_long),  &in_w));
    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_long),  &in_h));
    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_long),  &in_c));
    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_long),  &out_w));
    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_long),  &out_h));
    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_long),  &k_w));
    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_long),  &k_h));
    CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_long),  &n_pelements));
    CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_long),  &chw));
    CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int),      &stride0));
    CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int),      &stride1));
    CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int),      &pad0));
    CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int),      &pad1));
    CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int),      &dilation0));
    CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int),      &dilation1));

    // Dimension 0 is n_pelements rounded up to whole work-groups of 256 items.
    const int wg_size    = 256;
    const int num_blocks = (n_pelements + wg_size - 1) / wg_size;

    size_t global_work_size[] = {(size_t)num_blocks*wg_size, (size_t)out_h, (size_t)n_batch*in_c};
    size_t local_work_size[]  = {(size_t)wg_size, 1, 1};

#ifdef GGML_OPENCL_PROFILING
    cl_event evt;
    CL_CHECK(clEnqueueNDRangeKernel(ctx->queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

    g_profiling_info.emplace_back();
    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
#else
    CL_CHECK(clEnqueueNDRangeKernel(ctx->queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
#endif
}
|
|
4817
|
+
|
|
4067
4818
|
//------------------------------------------------------------------------------
|
|
4068
4819
|
// Op offloading
|
|
4069
4820
|
//------------------------------------------------------------------------------
|
|
@@ -4122,6 +4873,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
4122
4873
|
}
|
|
4123
4874
|
func = ggml_cl_gelu;
|
|
4124
4875
|
break;
|
|
4876
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
4877
|
+
if (!any_on_device) {
|
|
4878
|
+
return false;
|
|
4879
|
+
}
|
|
4880
|
+
func = ggml_cl_gelu_quick;
|
|
4881
|
+
break;
|
|
4125
4882
|
case GGML_UNARY_OP_SILU:
|
|
4126
4883
|
if (!any_on_device) {
|
|
4127
4884
|
return false;
|
|
@@ -4194,6 +4951,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
|
4194
4951
|
}
|
|
4195
4952
|
func = ggml_cl_rope;
|
|
4196
4953
|
break;
|
|
4954
|
+
case GGML_OP_IM2COL:
|
|
4955
|
+
if (!any_on_device) {
|
|
4956
|
+
return false;
|
|
4957
|
+
}
|
|
4958
|
+
func = ggml_cl_im2col;
|
|
4959
|
+
break;
|
|
4197
4960
|
default:
|
|
4198
4961
|
return false;
|
|
4199
4962
|
}
|