@fugood/llama.node 0.3.12 → 0.3.14
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp:

```diff
@@ -1,4 +1,4 @@
-#define CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 
 // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
 #include <vector>
 #include <string>
 #include <cmath>
+#include <memory>
+#include <charconv>
 
 #undef MIN
 #undef MAX
```
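The target OpenCL version is no longer hard-coded: it now comes from `GGML_OPENCL_TARGET_VERSION`, which the build system is expected to define. A minimal sketch of how that macro gates the OpenCL 3.0-only device query that appears in the next hunk (the fallback value here is an assumption of the sketch, not the package's actual build default):

```cpp
#include <cstdio>

// GGML_OPENCL_TARGET_VERSION is expected to arrive from the build system;
// the fallback below is a placeholder for this sketch only.
#ifndef GGML_OPENCL_TARGET_VERSION
#define GGML_OPENCL_TARGET_VERSION 300
#endif
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION

int main() {
#if CL_TARGET_OPENCL_VERSION >= 300
    // Compiled against OpenCL 3.0+: CL_DEVICE_OPENCL_C_ALL_VERSIONS is available.
    std::printf("can query CL_DEVICE_OPENCL_C_ALL_VERSIONS\n");
#else
    // Older target: only the legacy CL_DEVICE_OPENCL_C_VERSION query exists.
    std::printf("fall back to CL_DEVICE_OPENCL_C_VERSION\n");
#endif
    return 0;
}
```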
```diff
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
     X1E,
 };
 
+struct ggml_cl_version {
+    cl_uint major = 0;
+    cl_uint minor = 0;
+};
+
+// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version parse_cl_version(std::string_view str) {
+    size_t major_str_begin = 0;
+    size_t major_str_end = str.find(".", major_str_begin);
+    if (major_str_end == std::string::npos) {
+        return {};
+    }
+
+    size_t minor_str_begin = major_str_end + 1;
+    size_t minor_str_end = str.find(" ", minor_str_begin);
+    if (minor_str_end == std::string::npos) {
+        return {};
+    }
+
+    cl_uint version_major;
+    if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+        return {};
+    }
+
+    cl_uint version_minor;
+    if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+        return {};
+    }
+    return { version_major, version_minor };
+}
+
+// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+    size_t param_size;
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+    auto param_value = std::string_view(param_storage.get(), param_size);
+    const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+    return parse_cl_version(param_value);
+}
+
+// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+    size_t param_size;
+
+#if CL_TARGET_OPENCL_VERSION >= 300
+    if (platform_version.major >= 3) {
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+        if (!param_size) {
+            return {};
+        }
+
+        std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+        CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+        unsigned versions_count = param_size / sizeof(cl_name_version);
+
+        cl_version version_max = 0;
+        for (unsigned i = 0; i < versions_count; i++) {
+            version_max = std::max<cl_version>(versions[i].version, version_max);
+        }
+
+        return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+    }
+#else
+    GGML_UNUSED(platform_version);
+#endif // CL_TARGET_OPENCL_VERSION >= 300
+
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+    if (!param_size) {
+        return {};
+    }
+
+    std::unique_ptr<char[]> param_storage(new char[param_size]);
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+    auto param_value = std::string_view(param_storage.get(), param_size);
+
+    const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+    if (param_value.find(version_prefix) != 0) {
+        return {};
+    }
+    param_value.remove_prefix(version_prefix.length());
+
+    return parse_cl_version(param_value);
+}
+
 static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
     if (strstr(device_name, "730") ||
         strstr(device_name, "740") ||
```
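These helpers replace the old `strstr`-based version check with real parsing of the `OpenCL X.Y ...` strings. A self-contained sketch of the same parsing logic, with plain `unsigned` standing in for `cl_uint` so it compiles without OpenCL headers; the sample version string is made up:

```cpp
#include <charconv>
#include <cstdio>
#include <string_view>

struct cl_ver { unsigned major = 0, minor = 0; };

// Mirrors parse_cl_version: expects "XX.YY " and returns zeroes on error.
static cl_ver parse_ver(std::string_view str) {
    size_t dot = str.find('.');
    if (dot == std::string_view::npos) return {};
    size_t space = str.find(' ', dot + 1);
    if (space == std::string_view::npos) return {};

    cl_ver v;
    if (std::from_chars(str.data(), str.data() + dot, v.major).ec != std::errc{}) return {};
    if (std::from_chars(str.data() + dot + 1, str.data() + space, v.minor).ec != std::errc{}) return {};
    return v;
}

int main() {
    // CL_PLATFORM_VERSION strings look like "OpenCL 3.0 <vendor info>".
    std::string_view platform = "OpenCL 3.0 ExampleVendor 12.4";
    constexpr std::string_view prefix = "OpenCL ";
    if (platform.substr(0, prefix.size()) == prefix) {
        platform.remove_prefix(prefix.size());
        cl_ver v = parse_ver(platform);
        std::printf("parsed %u.%u\n", v.major, v.minor); // parsed 3.0
    }
    return 0;
}
```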
```diff
@@ -143,6 +236,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_rms_norm;
     cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
     cl_kernel kernel_soft_max, kernel_soft_max_4;
+    cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
     cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
     cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
     cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -277,7 +371,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     cl_int err;
 
-#ifdef
+#ifdef GGML_OPENCL_PROFILING
     GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
 #endif
 
@@ -443,19 +537,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
 
-        //
-
-            backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-            backend_ctx->adreno_wave_size = 128;
-        } else {
-            backend_ctx->adreno_wave_size = 128;
-            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                "using wave size %d, "
-                "may not work as expected\n",
-                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-        }
+        // Use wave size of 64 for all Adreno GPUs.
+        backend_ctx->adreno_wave_size = 64;
     } else if (strstr(default_device->name, "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
@@ -480,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     // A local ref of cl_device_id for convenience
     cl_device_id device = backend_ctx->device;
 
+    ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+
     // Check device OpenCL version, OpenCL 2.0 or above is required
-
-
-    char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-    device_ver_buffer[device_ver_str_size] = '\0';
-    GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
-
-    if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-        strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+    ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+    if (opencl_c_version.major < 2) {
         GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
         return backend_ctx;
     }
@@ -526,15 +604,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
 
     // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
     // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-    if (strstr(
-        strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+    if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
         strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
         GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
             "(note that subgroups is an optional feature in OpenCL 3.0)\n");
         return backend_ctx;
     }
 
-
+    cl_uint base_align_in_bits;
+    CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
+    GGML_ASSERT(base_align_in_bits % 8u == 0);
+    backend_ctx->alignment = base_align_in_bits / 8u;
     GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
 
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
@@ -588,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     const std::string kernel_src = read_file("ggml-opencl.cl");
 #endif
 
-
-    "
-
+    auto opencl_c_std =
+        std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+    std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                               " -cl-mad-enable -cl-unsafe-math-optimizations"
+                               " -cl-finite-math-only -cl-fast-relaxed-math";
     backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
 
     // Non matmul kernels.
```
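With the OpenCL C version detected, kernels are compiled against an explicit `-cl-std=CL<major>.<minor>` instead of a hard-coded standard. A quick sketch of the option string being assembled (the version values are hypothetical):

```cpp
#include <cstdio>
#include <string>

int main() {
    unsigned major = 3, minor = 0; // e.g. what get_opencl_c_version() might return
    std::string opencl_c_std = "CL" + std::to_string(major) + "." + std::to_string(minor);
    std::string compile_opts = "-cl-std=" + opencl_c_std +
                               " -cl-mad-enable -cl-unsafe-math-optimizations"
                               " -cl-finite-math-only -cl-fast-relaxed-math";
    std::printf("%s\n", compile_opts.c_str()); // -cl-std=CL3.0 -cl-mad-enable ...
    return 0;
}
```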
```diff
@@ -614,6 +697,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
     CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
     CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
+    CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
     CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
     CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
     CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
@@ -698,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
 
     // Gemv general
-    std::string CL_gemv_compile_opts =
-
-
-
+    std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                       " -cl-mad-enable "
+                                       " -DSIMDGROUP_WIDTH=" +
+                                       std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -718,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -740,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 2048, 16384
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=2048 "
+                           " -DBLOCK_STRIDE_A=16384 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -755,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 5504, 44032
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=5504 "
+                           " -DBLOCK_STRIDE_A=44032 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -770,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
     CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
 
     // Gemv 16000, 128000
-    CL_gemv_compile_opts =
-
-
-
-
-
+    CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                           " -cl-mad-enable "
+                           " -DLINE_STRIDE_A=16000 "
+                           " -DBLOCK_STRIDE_A=128000 "
+                           " -DSIMDGROUP_WIDTH=" +
+                           std::to_string(backend_ctx->adreno_wave_size);
     if (has_vector_subgroup_broadcast) {
         CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
     }
@@ -1012,17 +1097,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
         case GGML_OP_ADD:
         case GGML_OP_SCALE:
        case GGML_OP_MUL:
-            return
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_GELU:
                 case GGML_UNARY_OP_SILU:
                 case GGML_UNARY_OP_RELU:
-                    return ggml_is_contiguous(op->src[0]);
+                    return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                 default:
                     return false;
             }
         case GGML_OP_CLAMP:
+            return op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_SOFT_MAX:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
@@ -1044,8 +1130,16 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
             return true;
         case GGML_OP_DIAG_MASK_INF:
             return op->ne[3] == 1;
-        case GGML_OP_ROPE:
+        case GGML_OP_ROPE: {
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
             return true;
+        }
         default:
             return false;
     }
```
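`GGML_OP_ROPE` support is now gated on the rope mode stored in `op_params`, rejecting the multimodal (M-RoPE) and vision variants, which have no OpenCL kernels. A sketch of the bit test below uses local stand-in constants; the real flag values live in `ggml.h`:

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for GGML_ROPE_TYPE_MROPE / GGML_ROPE_TYPE_VISION;
// only the bit-test pattern is the point here.
constexpr int ROPE_TYPE_MROPE  = 1 << 3;
constexpr int ROPE_TYPE_VISION = ROPE_TYPE_MROPE | (1 << 4);

static bool rope_supported(const int32_t * op_params) {
    const int mode = op_params[2]; // rope mode is the third op parameter
    if (mode & ROPE_TYPE_MROPE)  return false; // no CL kernel for M-RoPE
    if (mode & ROPE_TYPE_VISION) return false; // no CL kernel for vision rope
    return true;
}

int main() {
    int32_t normal[3] = {0, 0, 0};
    int32_t vision[3] = {0, 0, ROPE_TYPE_VISION};
    std::printf("%d %d\n", rope_supported(normal), rope_supported(vision)); // 1 0
    return 0;
}
```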
```diff
@@ -1198,20 +1292,17 @@ struct ggml_backend_opencl_buffer_context {
     std::string name;
 };
 
-static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
 static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
     delete ctx;
 }
 
 static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-
-
-    GGML_UNUSED(buffer);
+    ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+    return (void *) (uintptr_t) backend_ctx->alignment;
 }
 
-static
+static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
 
     ggml_cl2_init(buffer->buft->device);
```
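The fixed fake base pointer `0x1000` is gone: the buffer now reports the device's base-address alignment (queried earlier in bits and converted to bytes) as its base, so the offsets derived from it respect what the device requires. A sketch of the arithmetic, with a hypothetical alignment value:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // CL_DEVICE_MEM_BASE_ADDR_ALIGN reports bits; this value is hypothetical.
    unsigned base_align_in_bits = 1024;
    assert(base_align_in_bits % 8u == 0);
    unsigned alignment = base_align_in_bits / 8u; // 128 bytes

    // The backend hands this value out as the buffer "base". No host memory
    // lives there; the pointer only anchors offset arithmetic into cl_mem.
    char * base = (char *) (uintptr_t) alignment;

    // A tensor placed two aligned slots in gets an offset that is stored in
    // its extra->offset and stays a multiple of the device alignment.
    char * tensor_data = base + 2 * alignment;
    size_t offset = (size_t) (tensor_data - base);
    std::printf("alignment=%u offset=%zu\n", alignment, offset); // 128, 256
    return 0;
}
```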
```diff
@@ -1241,7 +1332,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
         tensor->extra = view_extra;
     } else {
         {
-            size_t offset = (char *)tensor->data - (char *)
+            size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);
 
             ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
             extra->offset = offset;
@@ -1251,6 +1342,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
             tensor->extra = extra;
         }
     }
+    return GGML_STATUS_SUCCESS;
 }
 
 // The optimized gemm and gemv kernels are used for large matrices without batch.
@@ -1365,6 +1457,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     int M = tensor->ne[1]; // ne01
     int K = tensor->ne[0]; // ne00
 
+    //For matrix-vector multiplication kernel, we assume K is a multiple of 32
+    GGML_ASSERT(K % 32 == 0);
+    //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+    GGML_ASSERT(M % 4 == 0);
+
     // transpose is out of place, so we need to allocate transposed buffers
     // <----------------------------------------------------------------------------------> //
     // use sub_buffer of max buffer size instead
@@ -1405,36 +1502,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     cl_mem qT_d_image1D;
     cl_mem dT_d_image1D;
 
-    cl_image_format img_fmt_1d = { CL_RGBA,
+    cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     cl_image_desc img_desc_1d;
 
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K /
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = extra->q;
     q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA,
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K /
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = qT_d;
     qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA,
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = extra->d;
     d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA,
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = dT_d;
     dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
```
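The image widths above pack the tensor into RGBA `CL_HALF_FLOAT` texels, i.e. four 16-bit components (8 bytes) per texel. Assuming Q4_0 weights, as in this Adreno gemv path, the divisors work out as follows (matrix sizes are hypothetical):

```cpp
#include <cstdio>

int main() {
    long M = 4096, K = 4096; // hypothetical weight matrix, 4-bit quantized

    // Quants: M*K 4-bit values = M*K/2 bytes; 8 bytes per RGBA-half texel.
    long q_texels = M * K / 4 / 4;  // same as (M*K/2) / 8

    // Scales: one fp16 scale per 32 weights; 4 halfs per RGBA-half texel.
    long d_texels = M * K / 32 / 4;

    std::printf("q=%ld texels, d=%ld texels\n", q_texels, d_texels);
    return 0;
}
```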
```diff
@@ -1443,8 +1540,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     // set up and call the transpose kernels
     // <----------------------------------------------------------------------------------> //
     // weights
-    int height_q = M /
-    int width_q = K /
+    int height_q = M / 4;
+    int width_q = K / 4 / 4;
     kernel = backend_ctx->kernel_transpose_16;
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1458,8 +1555,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     CL_CHECK(clWaitForEvents(1, &evt));
 
     // scales
-    int height_s = M /
-    int width_s = K / 32 /
+    int height_s = M / 4;
+    int width_s = K / 32 / 4;
 
     kernel = backend_ctx->kernel_transpose_16;
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1853,7 +1950,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tenso
     void * buf_d;
 #endif
 
-#ifdef GGML_USE_OPENCL
     // Make sure everything is done.
     CL_CHECK(clFinish(queue));
 
@@ -1889,7 +1985,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tenso
         extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
     CL_CHECK(clFinish(queue));
 #endif // GGML_OPENCL_SOA_Q
-#endif // GGML_USE_OPENCL
 
     // Open file and dump.
     char fname[512];
@@ -2569,26 +2664,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
     memcpy(&eps, dst->op_params, sizeof(float));
 
     const int ne00 = src0 ? src0->ne[0] : 0;
-    const
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
 
-
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     const int nth = MIN(64, ne00);
 
     cl_kernel kernel = backend_ctx->kernel_norm;
 
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &eps));
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
 
-
-
-    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
 #ifdef GGML_OPENCL_PROFILING
@@ -2626,16 +2728,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
     memcpy(&eps, dst->op_params, sizeof(float));
 
     const int ne00 = src0 ? src0->ne[0] : 0;
+    const int ne01 = src0 ? src0->ne[1] : 0;
+    const int ne02 = src0 ? src0->ne[2] : 0;
+    const int ne03 = src0 ? src0->ne[3] : 0;
+
     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     GGML_ASSERT(ne00 % 4 == 0);
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
 
     const int nth = MIN(64, ne00);
 
-
-
-    size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+    size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
     size_t local_work_size[] = {(size_t)nth, 1, 1};
 
     cl_kernel kernel = backend_ctx->kernel_rms_norm;
@@ -2650,15 +2755,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
         sizeof(local_work_size), local_work_size,
         sizeof(size_t), &sgs, NULL));
 
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
-    CL_CHECK(clSetKernelArg(kernel,
+    CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+    CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+    CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+    CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+    CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+    CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+    CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+    CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+    CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+    CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+    CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &eps));
     // This is local memory - the size depends on subgroup size.
-    CL_CHECK(clSetKernelArg(kernel,
+    CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
 
 #ifdef GGML_OPENCL_PROFILING
     cl_event evt;
```
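Both norm kernels now launch a three-dimensional NDRange, one workgroup of `nth` threads per row, with `ne02` and `ne03` mapped onto the second and third grid dimensions instead of everything being flattened into `nrows*nth`; together with the new `ne0x`/`nb0x` kernel arguments this removes the old `ggml_is_contiguous_1` restriction. A sketch of the work-size math with hypothetical dimensions:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    int ne00 = 4096, ne01 = 32, ne02 = 8, ne03 = 2; // a hypothetical 4D tensor
    int nth = std::min(64, ne00); // threads per row, capped at 64

    size_t global[3] = {(size_t) ne01 * nth, (size_t) ne02, (size_t) ne03};
    size_t local[3]  = {(size_t) nth, 1, 1};

    // One workgroup per row, replicated across the two outer dimensions.
    size_t workgroups = (global[0] / local[0]) * global[1] * global[2];
    std::printf("%zu workgroups for %d x %d x %d rows\n",
                workgroups, ne01, ne02, ne03); // 512 workgroups
    return 0;
}
```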
```diff
@@ -2854,6 +2964,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     CL_CHECK(status);
 
     int height_B = N/4;
+    if (height_B == 0) {
+        height_B = 1;
+    }
     int width_B = K/4;
     int padded_height_B = (N + padding)/4;
 
@@ -3002,11 +3115,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     }
 
     if (N == 1) {
-
+        size_t wavesize = backend_ctx->adreno_wave_size;
+        local_work_size[0] = wavesize; // localsize
         local_work_size[1] = 4; // reduce factor
         local_work_size[2] = 1;
 
-        global_work_size[0] = M / 2;
+        global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
         global_work_size[1] = 4; // reduce factor
         global_work_size[2] = 1;
     }
```
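In the `N == 1` gemv path the global size is now rounded up to a multiple of the wave size; the old `M / 2` is not necessarily divisible by the local work size, which OpenCL requires of an NDRange. The fix in isolation (values are hypothetical):

```cpp
#include <cstdio>

int main() {
    size_t wavesize = 64;  // backend_ctx->adreno_wave_size
    size_t M = 1000;       // rows of the output
    size_t raw = M / 2;    // 500: not a multiple of 64
    // Classic round-up-to-multiple: ceil(raw / wavesize) * wavesize.
    size_t rounded = ((raw + wavesize - 1) / wavesize) * wavesize; // 512
    std::printf("raw=%zu rounded=%zu\n", raw, rounded);
    return 0;
}
```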
```diff
@@ -3015,6 +3129,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     // enqueue kernel with profiling
     // <--------------------------------------------> //
 #ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
     CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
 
     g_profiling_info.emplace_back();
@@ -3666,6 +3781,8 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
+    const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
     // Local size must be wave size. Each workgroup is a wave, working on a row,
     // where a row corresponds to leading dimension.
     int nth = MIN(32, ne00);
@@ -3683,9 +3800,17 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
     cl_kernel kernel;
 
     if (ne00%4 == 0) {
-
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_4_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max_4;
+        }
     } else {
-
+        if (use_f16) {
+            kernel = backend_ctx->kernel_soft_max_f16;
+        } else {
+            kernel = backend_ctx->kernel_soft_max;
+        }
     }
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@@ -3746,10 +3871,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     const int ne02 = src0 ? src0->ne[2] : 0;
     const int ne03 = src0 ? src0->ne[3] : 0;
 
-    const
-    const
-    const
-    const
+    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
 
     const int ne10 = src1 ? src1->ne[0] : 0;
     const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3761,12 +3886,13 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
     const int ne2 = dst ? dst->ne[2] : 0;
     const int ne3 = dst ? dst->ne[3] : 0;
 
-    const
-    const
-    const
-    const
+    const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+    const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+    const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+    const cl_ulong nb3 = dst ? dst->nb[3] : 0;
 
-    GGML_ASSERT(ne10 ==
+    GGML_ASSERT(ne10 % ne02 == 0);
+    GGML_ASSERT(ne10 >= ne02);
 
     int nth = MIN(64, ne00);
 
```