@fugood/llama.node 0.3.13 → 0.3.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +89 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/CMakeLists.txt +9 -1
- package/src/llama.cpp/cmake/common.cmake +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +132 -13
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +33 -174
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +10 -9
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
- package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
- package/src/llama.cpp/examples/infill/infill.cpp +2 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +79 -34
- package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +196 -108
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
- package/src/llama.cpp/examples/server/server.cpp +113 -101
- package/src/llama.cpp/examples/server/utils.hpp +94 -105
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +263 -151
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
- package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +29 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
- package/src/llama.cpp/ggml/src/ggml.c +93 -5
- package/src/llama.cpp/include/llama.h +105 -27
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/CMakeLists.txt +5 -2
- package/src/llama.cpp/src/llama-adapter.cpp +19 -20
- package/src/llama.cpp/src/llama-adapter.h +11 -9
- package/src/llama.cpp/src/llama-arch.cpp +123 -16
- package/src/llama.cpp/src/llama-arch.h +19 -0
- package/src/llama.cpp/src/llama-batch.h +2 -2
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-context.cpp +2253 -1222
- package/src/llama.cpp/src/llama-context.h +214 -77
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-graph.cpp +1662 -0
- package/src/llama.cpp/src/llama-graph.h +574 -0
- package/src/llama.cpp/src/llama-hparams.cpp +8 -0
- package/src/llama.cpp/src/llama-hparams.h +9 -0
- package/src/llama.cpp/src/llama-io.cpp +15 -0
- package/src/llama.cpp/src/llama-io.h +35 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
- package/src/llama.cpp/src/llama-kv-cache.h +178 -109
- package/src/llama.cpp/src/llama-memory.cpp +1 -0
- package/src/llama.cpp/src/llama-memory.h +21 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +8230 -122
- package/src/llama.cpp/src/llama-model.h +34 -1
- package/src/llama.cpp/src/llama-quant.cpp +10 -1
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +51 -9837
- package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp

@@ -1,4 +1,4 @@
-#define CL_TARGET_OPENCL_VERSION 220
+#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 
 // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
 #include <vector>
 #include <string>
 #include <cmath>
+#include <memory>
+#include <charconv>
 
 #undef MIN
 #undef MAX
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
     X1E,
 };
 
+struct ggml_cl_version {
+    cl_uint major = 0;
+    cl_uint minor = 0;
+};
+
+// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version parse_cl_version(std::string_view str) {
+    size_t major_str_begin = 0;
+    size_t major_str_end   = str.find(".", major_str_begin);
+    if (major_str_end == std::string::npos) {
+        return {};
+    }
+
+    size_t minor_str_begin = major_str_end + 1;
+    size_t minor_str_end   = str.find(" ", minor_str_begin);
+    if (minor_str_end == std::string::npos) {
+        return {};
+    }
+
+    cl_uint version_major;
+    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+        return {};
+    }
+
+    cl_uint version_minor;
+    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+        return {};
+    }
+    return { version_major, version_minor };
+}
+
+// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+    size_t param_size;
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+    auto param_value = std::string_view(param_storage.get(), param_size);
+    const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+    return parse_cl_version(param_value);
+}
+
+// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+    size_t param_size;
+
+#if CL_TARGET_OPENCL_VERSION >= 300
+    if (platform_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+        if (!param_size) {
+            return {};
+        }
+
+        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+        unsigned versions_count = param_size / sizeof(cl_name_version);
+
+        cl_version version_max = 0;
+        for (unsigned i = 0; i < versions_count; i++) {
+            version_max = std::max<cl_version>(versions[i].version, version_max);
+        }
+
+        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+    }
+#else
+    GGML_UNUSED(platform_version);
+#endif // CL_TARGET_OPENCL_VERSION >= 300
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+    if (!param_size) {
+        return {};
+    }
+
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+    auto param_value = std::string_view(param_storage.get(), param_size);
+
+    const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+
+    return parse_cl_version(param_value);
+}
+
 static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
     if (strstr(device_name, "730") ||
         strstr(device_name, "740") ||
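Note: the three helpers added above feed the `-cl-std=` selection later in `ggml_cl2_init`. As a rough standalone sketch of the parsing contract (a stripped-down analogue, not the backend code; the `parse` name and sample string are hypothetical):

```cpp
#include <charconv>
#include <cstdio>
#include <string_view>

// Mirrors parse_cl_version(): expects "XX.YY " (note the mandatory separator
// after the minor number, here a space) and returns zeroes on any error.
struct version { unsigned major = 0, minor = 0; };

static version parse(std::string_view s) {
    size_t dot = s.find('.');
    if (dot == std::string_view::npos) return {};
    size_t space = s.find(' ', dot + 1);
    if (space == std::string_view::npos) return {};
    version v;
    if (std::from_chars(s.data(), s.data() + dot, v.major).ec != std::errc{}) return {};
    if (std::from_chars(s.data() + dot + 1, s.data() + space, v.minor).ec != std::errc{}) return {};
    return v;
}

int main() {
    // clGetPlatformInfo(CL_PLATFORM_VERSION) returns "OpenCL X.Y <info>";
    // the backend strips the "OpenCL " prefix before parsing.
    version v = parse("3.0 Some Vendor Platform");
    std::printf("major=%u minor=%u\n", v.major, v.minor); // prints: major=3 minor=0
}
```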
@@ -204,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
 struct ProfilingInfo {
     std::string op_name;
     std::string kernel_name;
-    // Kernel execution time in nanoseconds.
-    cl_ulong duration_ns;
+
+    cl_kernel kernel;
+    cl_event evt;
+
+    cl_ulong cmd_queued;
+    cl_ulong cmd_submit;
+    cl_ulong cmd_start;
+    cl_ulong cmd_end;
+    cl_ulong overhead_start;
+    cl_ulong overhead_end;
+    // For the times below, see spec for clGetEventProfilingInfo
+    // The time kernel spent in cmd queue - SUBMIT - QUEUED
+    cl_ulong cmd_queued_duration_ns;
+    // The time kernel spent for submission - START - SUBMIT
+    cl_ulong cmd_submit_duration_ns;
+    // Kernel execution time in nanoseconds - END - START
+    cl_ulong cmd_duration_ns;
+    // The time for the kernel to complete - COMPLETE - END
+    cl_ulong cmd_complete_duration_ns;
+    // Total time to finish the kernel - COMPLETE - QUEUED
+    cl_ulong cmd_total_duration_ns;
     // Global and local work sizes.
     size_t global_size[3];
     size_t local_size[3];
@@ -278,7 +390,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     cl_int err;
 
-#ifdef
+#ifdef GGML_OPENCL_PROFILING
     GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
 #endif
 
@@ -444,19 +556,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
 
-        //
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A6X) {
-            backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-            backend_ctx->adreno_wave_size = 128;
-        } else {
-            backend_ctx->adreno_wave_size = 128;
-            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                "using wave size %d, "
-                "may not work as expected\n",
-                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-        }
+        // Use wave size of 64 for all Adreno GPUs.
+        backend_ctx->adreno_wave_size = 64;
     } else if (strstr(default_device->name, "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
@@ -481,16 +582,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // A local ref of cl_device_id for convenience
     cl_device_id device = backend_ctx->device;
 
+    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+
     // Check device OpenCL version, OpenCL 2.0 or above is required
-    size_t device_ver_str_size;
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
-    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-    device_ver_buffer[device_ver_str_size] = '\0';
-    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
-
-    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+    if (opencl_c_version.major < 2) {
         GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
         return backend_ctx;
     }
@@ -527,15 +623,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
     // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-    if (strstr(
-        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
         strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
         GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
             "(note that subgroups is an optional feature in OpenCL 3.0)\n");
         return backend_ctx;
     }
 
-
+    cl_uint base_align_in_bits;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
+    GGML_ASSERT(base_align_in_bits % 8u == 0);
+    backend_ctx->alignment = base_align_in_bits / 8u;
     GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
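Note on the alignment change in the hunk above: `CL_DEVICE_MEM_BASE_ADDR_ALIGN` is specified in bits, so the backend now converts it to bytes before storing it. A minimal sketch, assuming a valid `cl_device_id` (the helper name is hypothetical; error handling omitted):

```cpp
#include <CL/cl.h>

// Query the minimum buffer base alignment and convert bits -> bytes,
// as the new ggml_cl2_init code does.
static size_t query_base_alignment(cl_device_id device) {
    cl_uint bits = 0;
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(bits), &bits, NULL);
    return bits / 8u; // e.g. 1024 bits -> 128 bytes
}
```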
@@ -589,9 +687,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     const std::string kernel_src = read_file("ggml-opencl.cl");
 #endif
 
-    std::string compile_opts =
-        "
-
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
     backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
 
     // Non matmul kernels.
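With the version probing in place, the `-cl-std=` flag is derived from the detected OpenCL C version instead of being hard-coded. A hypothetical helper showing the string that `build_program_from_source` ends up receiving:

```cpp
#include <string>

// For a device reporting OpenCL C 3.0 this yields
// "-cl-std=CL3.0 -cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math".
static std::string make_compile_opts(unsigned major, unsigned minor) {
    return "-cl-std=CL" + std::to_string(major) + "." + std::to_string(minor) +
           " -cl-mad-enable -cl-unsafe-math-optimizations"
           " -cl-finite-math-only -cl-fast-relaxed-math";
}
```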
@@ -701,10 +802,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
 
     // Gemv general
-    std::string CL_gemv_compile_opts =
-
-
-
+    std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable "
+                                       " -DSIMDGROUP_WIDTH=" +
+                                       std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -721,12 +822,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -743,12 +844,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -758,12 +859,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 5504, 44032
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=5504 "
+                           " -DBLOCK_STRIDE_A=44032 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -773,12 +874,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 16000, 128000
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=16000 "
+                           " -DBLOCK_STRIDE_A=128000 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
    }
@@ -821,12 +922,56 @@ static void ggml_cl2_free(void) {
         return;
     }
 
+    // Populate profiling info
+    for (ProfilingInfo & info : g_profiling_info) {
+        cl_ulong cmd_queued;
+        cl_ulong cmd_submit;
+        cl_ulong cmd_start;
+        cl_ulong cmd_end;
+        cl_ulong cmd_complete;
+
+        CL_CHECK(clWaitForEvents(1, &info.evt));
+        CL_CHECK(clGetEventProfilingInfo(
+            info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+        CL_CHECK(clGetEventProfilingInfo(
+            info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+        CL_CHECK(clGetEventProfilingInfo(
+            info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+        CL_CHECK(clGetEventProfilingInfo(
+            info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+        CL_CHECK(clGetEventProfilingInfo(
+            info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+        CL_CHECK(clReleaseEvent(info.evt));
+
+        char kernel_name[512];
+        CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+            sizeof(kernel_name), kernel_name, NULL));
+        info.kernel_name = kernel_name;
+
+        info.cmd_queued = cmd_queued;
+        info.cmd_submit = cmd_submit;
+        info.cmd_start = cmd_start;
+        info.cmd_end = cmd_end;
+
+        info.cmd_queued_duration_ns = cmd_submit - cmd_queued;
+        info.cmd_submit_duration_ns = cmd_start - cmd_submit;
+        info.cmd_duration_ns = cmd_end - cmd_start;
+        info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+        info.cmd_total_duration_ns = cmd_complete - cmd_queued;
+    }
+
+    // Dump a csv
     float total_kernel_time = 0;
-    fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
+    fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
     for (const ProfilingInfo & info : g_profiling_info) {
-        total_kernel_time += info.duration_ns/1.e6f;
-        fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
-            info.op_name.c_str(), info.kernel_name.c_str(),
+        total_kernel_time += info.cmd_duration_ns/1.e6f;
+        fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+            info.op_name.c_str(), info.kernel_name.c_str(),
+            info.cmd_queued_duration_ns/1.e6f,
+            info.cmd_submit_duration_ns/1.e6f,
+            info.cmd_duration_ns/1.e6f,
+            info.cmd_complete_duration_ns/1.e6f,
+            info.cmd_total_duration_ns/1.e6f,
             info.global_size[0], info.global_size[1], info.global_size[2],
             info.local_size[0], info.local_size[2], info.local_size[2],
             info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
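The bookkeeping above waits on each recorded event and reads all five `CL_PROFILING_COMMAND_*` timestamps, which are nanosecond values on a common device clock, so the duration fields are plain differences. A condensed sketch of that pattern (assumes `evt` came from a queue created with `CL_QUEUE_PROFILING_ENABLE`; the helper name is hypothetical):

```cpp
#include <CL/cl.h>

// Sketch: the QUEUED -> SUBMIT -> START -> END -> COMPLETE timeline read above.
static void read_kernel_timings(cl_event evt) {
    cl_ulong queued, submit, start, end, complete;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED,   sizeof(queued),   &queued,   NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT,   sizeof(submit),   &submit,   NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,    sizeof(start),    &start,    NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,      sizeof(end),      &end,      NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(complete), &complete, NULL);

    cl_ulong queue_wait_ns = submit - queued;    // "queued duration" column in the CSV
    cl_ulong exec_ns       = end - start;        // "exec duration" column
    cl_ulong total_ns      = complete - queued;  // "total duration" column
    (void) queue_wait_ns; (void) exec_ns; (void) total_ns;
}
```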
@@ -834,6 +979,27 @@ static void ggml_cl2_free(void) {
     fclose(fperf);
 
     GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+    // Dump a simple chrome trace
+    FILE* ftrace = fopen("cl_trace.json", "w");
+    if (!ftrace) {
+        GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+        return;
+    }
+
+    fprintf(ftrace, "[\n");
+    for (const ProfilingInfo & info : g_profiling_info) {
+        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+            info.kernel_name.c_str(), info.cmd_queued/1000);
+        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+            info.kernel_name.c_str(), info.cmd_submit/1000);
+
+        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+            info.kernel_name.c_str(), info.cmd_start/1000);
+        fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+            info.kernel_name.c_str(), info.cmd_end/1000);
+    }
+    fclose(ftrace);
 #endif
 }
 
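The block above writes the Chrome trace-event format: one "ph": "B"/"E" pair per kernel on a "Host" lane (queued to submit) and a "Device" lane (start to end), with timestamps downscaled to microseconds. The resulting cl_trace.json loads directly in chrome://tracing or Perfetto. A standalone sketch emitting one such pair (kernel name and timestamps are hypothetical):

```cpp
#include <cstdio>

int main() {
    FILE * f = fopen("cl_trace.json", "w");
    if (!f) return 1;
    unsigned long start_us = 20, end_us = 45; // hypothetical device timestamps in microseconds
    fprintf(f, "[\n");
    fprintf(f, "{\"name\": \"kernel_mul_mat\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", start_us);
    fprintf(f, "{\"name\": \"kernel_mul_mat\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n", end_us);
    fclose(f); // trace viewers tolerate the missing closing "]"
    return 0;
}
```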
@@ -1015,17 +1181,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
         case GGML_OP_MUL:
-            return true;
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
-                    return ggml_is_contiguous(op->src[0]);
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                 default:
                     return false;
             }
         case GGML_OP_CLAMP:
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SOFT_MAX:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
@@ -1209,20 +1376,17 @@ struct ggml_backend_opencl_buffer_context {
     std::string name;
 };
 
-static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
 }
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return cl_ptr_base;
-
-    GGML_UNUSED(buffer);
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+    return (void *) (uintptr_t) backend_ctx->alignment;
 }
 
-static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
 
     ggml_cl2_init(buffer->buft->device);
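Context for the hunk above: the buffer's "base pointer" is synthetic (OpenCL buffers are not host-addressable), so tensors get fake addresses counted from it. It used to be the fixed `cl_ptr_base` (0x1000); it is now the device's base alignment, which keeps tensor addresses correctly aligned. Offsets into the `cl_mem` are then recovered by subtraction, roughly (hypothetical helper, a sketch of the arithmetic only):

```cpp
#include <cstddef>
#include <cstdint>

// `base_alignment` is what ggml_backend_opencl_buffer_get_base() now returns,
// reinterpreted as an address; the tensor's byte offset is a simple subtraction.
static size_t tensor_offset(const void * tensor_data, size_t base_alignment) {
    const char * base = (const char *) (uintptr_t) base_alignment; // was the fixed 0x1000
    return (size_t) ((const char *) tensor_data - base);
}
```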
@@ -1252,7 +1416,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         tensor->extra = view_extra;
     } else {
         {
-            size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+            size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);
 
             ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
             extra->offset = offset;
@@ -1262,6 +1426,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
             tensor->extra = extra;
         }
     }
+    return GGML_STATUS_SUCCESS;
 }
 
 // The optimized gemm and gemv kernels are used for large matrices without batch.
@@ -1376,6 +1541,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         int M = tensor->ne[1];   // ne01
         int K = tensor->ne[0];   // ne00
 
+        //For matrix-vector multiplication kernel, we assume K is a multiple of 32
+        GGML_ASSERT(K % 32 == 0);
+        //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+        GGML_ASSERT(M % 4 == 0);
+
         // transpose is out of place, so we need to allocate transposed buffers
         // <----------------------------------------------------------------------------------> //
         // use sub_buffer of max buffer size instead
@@ -1416,36 +1586,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         cl_mem qT_d_image1D;
         cl_mem dT_d_image1D;
 
-        cl_image_format img_fmt_1d = { CL_RGBA,
+        cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         cl_image_desc img_desc_1d;
 
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K /
+        img_desc_1d.image_width = M * K / 4 / 4;
        img_desc_1d.buffer = extra->q;
         q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA,
+        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K /
+        img_desc_1d.image_width = M * K / 4 / 4;
         img_desc_1d.buffer = qT_d;
         qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA,
+        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4
+        img_desc_1d.image_width = M * K / 32 / 4;
         img_desc_1d.buffer = extra->d;
         d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
 
-        img_fmt_1d = { CL_RGBA,
+        img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
         memset(&img_desc_1d, 0, sizeof(img_desc_1d));
         img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-        img_desc_1d.image_width = M * K / 32 / 4
+        img_desc_1d.image_width = M * K / 32 / 4;
         img_desc_1d.buffer = dT_d;
         dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
         CL_CHECK(err);
@@ -1454,8 +1624,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         // set up and call the transpose kernels
         // <----------------------------------------------------------------------------------> //
         // weights
-        int height_q = M /
-        int width_q = K /
+        int height_q = M / 4;
+        int width_q = K / 4 / 4;
         kernel = backend_ctx->kernel_transpose_16;
 
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1639,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
         CL_CHECK(clWaitForEvents(1, &evt));
 
         // scales
-        int height_s = M /
-        int width_s = K / 32 /
+        int height_s = M / 4;
+        int width_s = K / 32 / 4;
 
         kernel = backend_ctx->kernel_transpose_16;
         CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1864,7 +2034,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
     void * buf_d;
 #endif
 
-#ifdef GGML_USE_OPENCL
     // Make sure everything is done.
     CL_CHECK(clFinish(queue));
 
@@ -1900,7 +2069,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
         extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
     CL_CHECK(clFinish(queue));
 #endif // GGML_OPENCL_SOA_Q
-#endif // GGML_USE_OPENCL
 
     // Open file and dump.
     char fname[512];
@@ -1978,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
 // Profiling utility
 //------------------------------------------------------------------------------
 #ifdef GGML_OPENCL_PROFILING
-void populateProfilingInfo(
+static void populateProfilingInfo(
         ProfilingInfo& info, cl_event evt, cl_kernel kernel,
         size_t global_size[3], size_t local_size[3],
         const ggml_tensor * tensor) {
-    cl_ulong start;
-    cl_ulong end;
-
-    CL_CHECK(clGetEventProfilingInfo(
-        evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
-    CL_CHECK(clGetEventProfilingInfo(
-        evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
-
-    char kernel_name[512];
-    CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
-        sizeof(kernel_name), kernel_name, NULL));
-
-    info.duration_ns = end - start;
-    info.op_name = tensor->name;
-    info.kernel_name = kernel_name;
+    info.op_name = tensor->name;
+    info.kernel = kernel;
+    info.evt = evt;
 
     info.local_size[0] = local_size[0];
     info.local_size[1] = local_size[1];
     info.local_size[2] = local_size[2];
@@ -2580,26 +2737,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     memcpy(&eps, dst->op_params, sizeof(float));
 
     const int ne00 = src0 ? src0->ne[0] : 0;
-    const
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
 
-
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     const int nth = MIN(64, ne00);
 
     cl_kernel kernel = backend_ctx->kernel_norm;
 
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-
-
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
 
-    size_t global_work_size[] = {(size_t)
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
 #ifdef GGML_OPENCL_PROFILING
@@ -2637,16 +2801,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     memcpy(&eps, dst->op_params, sizeof(float));
 
     const int ne00 = src0 ? src0->ne[0] : 0;
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
+
     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     GGML_ASSERT(ne00 % 4 == 0);
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
 
     const int nth = MIN(64, ne00);
 
-    const int64_t nrows = ggml_nrows(src0);
-
-    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
     cl_kernel kernel = backend_ctx->kernel_rms_norm;
@@ -2661,15 +2828,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
             sizeof(local_work_size), local_work_size,
             sizeof(size_t), &sgs, NULL));
 
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
+    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+    CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+    CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+    CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+    CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
     // This is local memory - the size depends on subgroup size.
-    CL_CHECK(clSetKernelArg(kernel,
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
 
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
@@ -2865,6 +3037,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             CL_CHECK(status);
 
             int height_B = N/4;
+            if (height_B == 0) {
+                height_B = 1;
+            }
             int width_B = K/4;
             int padded_height_B = (N + padding)/4;
 
@@ -3013,11 +3188,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             }
 
             if (N == 1) {
-
+                size_t wavesize = backend_ctx->adreno_wave_size;
+                local_work_size[0] = wavesize; // localsize
                 local_work_size[1] = 4; // reduce factor
                 local_work_size[2] = 1;
 
-                global_work_size[0] = M / 2;
+                global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
                 global_work_size[1] = 4; // reduce factor
                 global_work_size[2] = 1;
             }
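The `N == 1` (matrix-vector) path above now rounds the first global work dimension up to a multiple of the wave size, so the global size stays divisible by the local size. It is the usual round-up idiom, sketched below with hypothetical numbers:

```cpp
#include <cstdio>

// ((x + w - 1) / w) * w rounds x up to the next multiple of w.
static size_t round_up(size_t x, size_t w) {
    return ((x + w - 1) / w) * w;
}

int main() {
    // e.g. M = 100 with a wave size of 64: M/2 = 50 rounds up to 64.
    std::printf("%zu\n", round_up(100 / 2, 64)); // prints: 64
}
```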
@@ -3026,6 +3202,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             // enqueue kernel with profiling
             // <--------------------------------------------> //
 #ifdef GGML_OPENCL_PROFILING
+            cl_event evt;
             CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
 
             g_profiling_info.emplace_back();
@@ -3767,10 +3944,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     const int ne02 = src0 ? src0->ne[2] : 0;
     const int ne03 = src0 ? src0->ne[3] : 0;
 
-    const
-    const
-    const
-    const
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     const int ne10 = src1 ? src1->ne[0] : 0;
     const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3782,10 +3959,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     const int ne2 = dst ? dst->ne[2] : 0;
     const int ne3 = dst ? dst->ne[3] : 0;
 
-    const
-    const
-    const
-    const
+    const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+    const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+    const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+    const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
     GGML_ASSERT(ne10 % ne02 == 0);
     GGML_ASSERT(ne10 >= ne02);