@fugood/llama.node 0.3.13 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +1 -1
- package/src/LlamaContext.cpp +98 -76
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +60 -10
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +3 -3
- package/src/llama.cpp/common/arg.cpp +112 -11
- package/src/llama.cpp/common/chat.cpp +960 -266
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +27 -67
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +45 -7
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +45 -7
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +110 -67
- package/src/llama.cpp/examples/server/server.cpp +82 -87
- package/src/llama.cpp/examples/server/utils.hpp +94 -107
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
- package/src/llama.cpp/ggml/include/ggml.h +5 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
- package/src/llama.cpp/ggml/src/ggml.c +8 -3
- package/src/llama.cpp/include/llama.h +19 -5
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +182 -182
- package/src/llama.cpp/src/llama-grammar.h +12 -3
- package/src/llama.cpp/src/llama-kv-cache.h +1 -0
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-model.cpp +69 -5
- package/src/llama.cpp/src/llama-sampling.cpp +43 -10
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +147 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +593 -395
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -55
- /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#define CL_TARGET_OPENCL_VERSION
|
|
1
|
+
#define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
|
|
2
2
|
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
|
|
3
3
|
|
|
4
4
|
// suppress warnings in CL headers for GCC and Clang
|
|
@@ -25,6 +25,8 @@
|
|
|
25
25
|
#include <vector>
|
|
26
26
|
#include <string>
|
|
27
27
|
#include <cmath>
|
|
28
|
+
#include <memory>
|
|
29
|
+
#include <charconv>
|
|
28
30
|
|
|
29
31
|
#undef MIN
|
|
30
32
|
#undef MAX
|
|
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
|
|
|
62
64
|
X1E,
|
|
63
65
|
};
|
|
64
66
|
|
|
67
|
+
struct ggml_cl_version {
|
|
68
|
+
cl_uint major = 0;
|
|
69
|
+
cl_uint minor = 0;
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
// Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
|
|
73
|
+
static ggml_cl_version parse_cl_version(std::string_view str) {
|
|
74
|
+
size_t major_str_begin = 0;
|
|
75
|
+
size_t major_str_end = str.find(".", major_str_begin);
|
|
76
|
+
if (major_str_end == std::string::npos) {
|
|
77
|
+
return {};
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
size_t minor_str_begin = major_str_end + 1;
|
|
81
|
+
size_t minor_str_end = str.find(" ", minor_str_begin);
|
|
82
|
+
if (minor_str_end == std::string::npos) {
|
|
83
|
+
return {};
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
cl_uint version_major;
|
|
87
|
+
if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
|
|
88
|
+
return {};
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
cl_uint version_minor;
|
|
92
|
+
if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
|
|
93
|
+
return {};
|
|
94
|
+
}
|
|
95
|
+
return { version_major, version_minor };
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
|
|
99
|
+
static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
|
|
100
|
+
size_t param_size;
|
|
101
|
+
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
|
|
102
|
+
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
|
103
|
+
CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
|
|
104
|
+
|
|
105
|
+
auto param_value = std::string_view(param_storage.get(), param_size);
|
|
106
|
+
const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
|
|
107
|
+
if (param_value.find(version_prefix) != 0) {
|
|
108
|
+
return {};
|
|
109
|
+
}
|
|
110
|
+
param_value.remove_prefix(version_prefix.length());
|
|
111
|
+
return parse_cl_version(param_value);
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
|
|
115
|
+
static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
|
|
116
|
+
size_t param_size;
|
|
117
|
+
|
|
118
|
+
#if CL_TARGET_OPENCL_VERSION >= 300
|
|
119
|
+
if (platform_version.major >= 3) {
|
|
120
|
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
|
|
121
|
+
if (!param_size) {
|
|
122
|
+
return {};
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
|
|
126
|
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
|
|
127
|
+
unsigned versions_count = param_size / sizeof(cl_name_version);
|
|
128
|
+
|
|
129
|
+
cl_version version_max = 0;
|
|
130
|
+
for (unsigned i = 0; i < versions_count; i++) {
|
|
131
|
+
version_max = std::max<cl_version>(versions[i].version, version_max);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
|
|
135
|
+
}
|
|
136
|
+
#else
|
|
137
|
+
GGML_UNUSED(platform_version);
|
|
138
|
+
#endif // CL_TARGET_OPENCL_VERSION >= 300
|
|
139
|
+
|
|
140
|
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
|
|
141
|
+
if (!param_size) {
|
|
142
|
+
return {};
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
std::unique_ptr<char[]> param_storage(new char[param_size]);
|
|
146
|
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
|
|
147
|
+
auto param_value = std::string_view(param_storage.get(), param_size);
|
|
148
|
+
|
|
149
|
+
const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
|
|
150
|
+
if (param_value.find(version_prefix) != 0) {
|
|
151
|
+
return {};
|
|
152
|
+
}
|
|
153
|
+
param_value.remove_prefix(version_prefix.length());
|
|
154
|
+
|
|
155
|
+
return parse_cl_version(param_value);
|
|
156
|
+
}
|
|
157
|
+
|
|
65
158
|
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
|
|
66
159
|
if (strstr(device_name, "730") ||
|
|
67
160
|
strstr(device_name, "740") ||
|
|
@@ -278,7 +371,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
278
371
|
|
|
279
372
|
cl_int err;
|
|
280
373
|
|
|
281
|
-
#ifdef
|
|
374
|
+
#ifdef GGML_OPENCL_PROFILING
|
|
282
375
|
GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
|
|
283
376
|
#endif
|
|
284
377
|
|
|
@@ -444,19 +537,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
444
537
|
backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
|
|
445
538
|
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
|
|
446
539
|
|
|
447
|
-
//
|
|
448
|
-
|
|
449
|
-
backend_ctx->adreno_wave_size = 64;
|
|
450
|
-
} else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
|
|
451
|
-
backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
|
|
452
|
-
backend_ctx->adreno_wave_size = 128;
|
|
453
|
-
} else {
|
|
454
|
-
backend_ctx->adreno_wave_size = 128;
|
|
455
|
-
GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
|
|
456
|
-
"using wave size %d, "
|
|
457
|
-
"may not work as expected\n",
|
|
458
|
-
backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
|
|
459
|
-
}
|
|
540
|
+
// Use wave size of 64 for all Adreno GPUs.
|
|
541
|
+
backend_ctx->adreno_wave_size = 64;
|
|
460
542
|
} else if (strstr(default_device->name, "Intel")) {
|
|
461
543
|
backend_ctx->gpu_family = GPU_FAMILY::INTEL;
|
|
462
544
|
} else {
|
|
@@ -481,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
481
563
|
// A local ref of cl_device_id for convenience
|
|
482
564
|
cl_device_id device = backend_ctx->device;
|
|
483
565
|
|
|
566
|
+
ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
|
|
567
|
+
|
|
484
568
|
// Check device OpenCL version, OpenCL 2.0 or above is required
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
|
|
488
|
-
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
|
|
489
|
-
device_ver_buffer[device_ver_str_size] = '\0';
|
|
490
|
-
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
|
|
491
|
-
|
|
492
|
-
if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
|
|
493
|
-
strstr(device_ver_buffer, "OpenCL 3") == NULL) {
|
|
569
|
+
ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
|
|
570
|
+
if (opencl_c_version.major < 2) {
|
|
494
571
|
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
|
|
495
572
|
return backend_ctx;
|
|
496
573
|
}
|
|
@@ -527,15 +604,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
527
604
|
|
|
528
605
|
// If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
|
|
529
606
|
// optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
|
|
530
|
-
if (strstr(
|
|
531
|
-
strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
|
607
|
+
if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
|
|
532
608
|
strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
|
|
533
609
|
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
|
|
534
610
|
"(note that subgroups is an optional feature in OpenCL 3.0)\n");
|
|
535
611
|
return backend_ctx;
|
|
536
612
|
}
|
|
537
613
|
|
|
538
|
-
|
|
614
|
+
cl_uint base_align_in_bits;
|
|
615
|
+
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
|
|
616
|
+
GGML_ASSERT(base_align_in_bits % 8u == 0);
|
|
617
|
+
backend_ctx->alignment = base_align_in_bits / 8u;
|
|
539
618
|
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);
|
|
540
619
|
|
|
541
620
|
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
|
|
@@ -589,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
589
668
|
const std::string kernel_src = read_file("ggml-opencl.cl");
|
|
590
669
|
#endif
|
|
591
670
|
|
|
592
|
-
|
|
593
|
-
"
|
|
594
|
-
|
|
671
|
+
auto opencl_c_std =
|
|
672
|
+
std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
|
|
673
|
+
|
|
674
|
+
std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
675
|
+
" -cl-mad-enable -cl-unsafe-math-optimizations"
|
|
676
|
+
" -cl-finite-math-only -cl-fast-relaxed-math";
|
|
595
677
|
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);
|
|
596
678
|
|
|
597
679
|
// Non matmul kernels.
|
|
@@ -701,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
701
783
|
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));
|
|
702
784
|
|
|
703
785
|
// Gemv general
|
|
704
|
-
std::string CL_gemv_compile_opts =
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
786
|
+
std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
787
|
+
" -cl-mad-enable "
|
|
788
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
789
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
708
790
|
if (has_vector_subgroup_broadcast) {
|
|
709
791
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
710
792
|
}
|
|
@@ -721,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
721
803
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));
|
|
722
804
|
|
|
723
805
|
// Gemv 2048, 16384
|
|
724
|
-
CL_gemv_compile_opts =
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
806
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
807
|
+
" -cl-mad-enable "
|
|
808
|
+
" -DLINE_STRIDE_A=2048 "
|
|
809
|
+
" -DBLOCK_STRIDE_A=16384 "
|
|
810
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
811
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
730
812
|
if (has_vector_subgroup_broadcast) {
|
|
731
813
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
732
814
|
}
|
|
@@ -743,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
743
825
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
744
826
|
|
|
745
827
|
// Gemv 2048, 16384
|
|
746
|
-
CL_gemv_compile_opts =
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
828
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
829
|
+
" -cl-mad-enable "
|
|
830
|
+
" -DLINE_STRIDE_A=2048 "
|
|
831
|
+
" -DBLOCK_STRIDE_A=16384 "
|
|
832
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
833
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
752
834
|
if (has_vector_subgroup_broadcast) {
|
|
753
835
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
754
836
|
}
|
|
@@ -758,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
758
840
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));
|
|
759
841
|
|
|
760
842
|
// Gemv 5504, 44032
|
|
761
|
-
CL_gemv_compile_opts =
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
843
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
844
|
+
" -cl-mad-enable "
|
|
845
|
+
" -DLINE_STRIDE_A=5504 "
|
|
846
|
+
" -DBLOCK_STRIDE_A=44032 "
|
|
847
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
848
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
767
849
|
if (has_vector_subgroup_broadcast) {
|
|
768
850
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
769
851
|
}
|
|
@@ -773,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
773
855
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));
|
|
774
856
|
|
|
775
857
|
// Gemv 16000, 128000
|
|
776
|
-
CL_gemv_compile_opts =
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
858
|
+
CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
859
|
+
" -cl-mad-enable "
|
|
860
|
+
" -DLINE_STRIDE_A=16000 "
|
|
861
|
+
" -DBLOCK_STRIDE_A=128000 "
|
|
862
|
+
" -DSIMDGROUP_WIDTH=" +
|
|
863
|
+
std::to_string(backend_ctx->adreno_wave_size);
|
|
782
864
|
if (has_vector_subgroup_broadcast) {
|
|
783
865
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
|
|
784
866
|
}
|
|
@@ -1015,17 +1097,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
1015
1097
|
case GGML_OP_ADD:
|
|
1016
1098
|
case GGML_OP_SCALE:
|
|
1017
1099
|
case GGML_OP_MUL:
|
|
1018
|
-
return
|
|
1100
|
+
return op->src[0]->type == GGML_TYPE_F32;
|
|
1019
1101
|
case GGML_OP_UNARY:
|
|
1020
1102
|
switch (ggml_get_unary_op(op)) {
|
|
1021
1103
|
case GGML_UNARY_OP_GELU:
|
|
1022
1104
|
case GGML_UNARY_OP_SILU:
|
|
1023
1105
|
case GGML_UNARY_OP_RELU:
|
|
1024
|
-
return ggml_is_contiguous(op->src[0]);
|
|
1106
|
+
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
|
|
1025
1107
|
default:
|
|
1026
1108
|
return false;
|
|
1027
1109
|
}
|
|
1028
1110
|
case GGML_OP_CLAMP:
|
|
1111
|
+
return op->src[0]->type == GGML_TYPE_F32;
|
|
1029
1112
|
case GGML_OP_SOFT_MAX:
|
|
1030
1113
|
case GGML_OP_NORM:
|
|
1031
1114
|
case GGML_OP_RMS_NORM:
|
|
@@ -1209,20 +1292,17 @@ struct ggml_backend_opencl_buffer_context {
|
|
|
1209
1292
|
std::string name;
|
|
1210
1293
|
};
|
|
1211
1294
|
|
|
1212
|
-
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
|
|
1213
|
-
|
|
1214
1295
|
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
1215
1296
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
|
1216
1297
|
delete ctx;
|
|
1217
1298
|
}
|
|
1218
1299
|
|
|
1219
1300
|
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
GGML_UNUSED(buffer);
|
|
1301
|
+
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
|
|
1302
|
+
return (void *) (uintptr_t) backend_ctx->alignment;
|
|
1223
1303
|
}
|
|
1224
1304
|
|
|
1225
|
-
static
|
|
1305
|
+
static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
1226
1306
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
|
1227
1307
|
|
|
1228
1308
|
ggml_cl2_init(buffer->buft->device);
|
|
@@ -1252,7 +1332,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
1252
1332
|
tensor->extra = view_extra;
|
|
1253
1333
|
} else {
|
|
1254
1334
|
{
|
|
1255
|
-
size_t offset = (char *)tensor->data - (char *)
|
|
1335
|
+
size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);
|
|
1256
1336
|
|
|
1257
1337
|
ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
|
|
1258
1338
|
extra->offset = offset;
|
|
@@ -1262,6 +1342,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
|
|
1262
1342
|
tensor->extra = extra;
|
|
1263
1343
|
}
|
|
1264
1344
|
}
|
|
1345
|
+
return GGML_STATUS_SUCCESS;
|
|
1265
1346
|
}
|
|
1266
1347
|
|
|
1267
1348
|
// The optimized gemm and gemv kernels are used for large matrices without batch.
|
|
@@ -1376,6 +1457,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1376
1457
|
int M = tensor->ne[1]; // ne01
|
|
1377
1458
|
int K = tensor->ne[0]; // ne00
|
|
1378
1459
|
|
|
1460
|
+
//For matrix-vector multiplication kernel, we assume K is a multiple of 32
|
|
1461
|
+
GGML_ASSERT(K % 32 == 0);
|
|
1462
|
+
//For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
|
|
1463
|
+
GGML_ASSERT(M % 4 == 0);
|
|
1464
|
+
|
|
1379
1465
|
// transpose is out of place, so we need to allocate transposed buffers
|
|
1380
1466
|
// <----------------------------------------------------------------------------------> //
|
|
1381
1467
|
// use sub_buffer of max buffer size instead
|
|
@@ -1416,36 +1502,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1416
1502
|
cl_mem qT_d_image1D;
|
|
1417
1503
|
cl_mem dT_d_image1D;
|
|
1418
1504
|
|
|
1419
|
-
cl_image_format img_fmt_1d = { CL_RGBA,
|
|
1505
|
+
cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
|
1420
1506
|
cl_image_desc img_desc_1d;
|
|
1421
1507
|
|
|
1422
1508
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
1423
1509
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
1424
|
-
img_desc_1d.image_width = M * K /
|
|
1510
|
+
img_desc_1d.image_width = M * K / 4 / 4;
|
|
1425
1511
|
img_desc_1d.buffer = extra->q;
|
|
1426
1512
|
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
|
1427
1513
|
CL_CHECK(err);
|
|
1428
1514
|
|
|
1429
|
-
img_fmt_1d = { CL_RGBA,
|
|
1515
|
+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
|
1430
1516
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
1431
1517
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
1432
|
-
img_desc_1d.image_width = M * K /
|
|
1518
|
+
img_desc_1d.image_width = M * K / 4 / 4;
|
|
1433
1519
|
img_desc_1d.buffer = qT_d;
|
|
1434
1520
|
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
|
1435
1521
|
CL_CHECK(err);
|
|
1436
1522
|
|
|
1437
|
-
img_fmt_1d = { CL_RGBA,
|
|
1523
|
+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
|
1438
1524
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
1439
1525
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
1440
|
-
img_desc_1d.image_width = M * K / 32 / 4
|
|
1526
|
+
img_desc_1d.image_width = M * K / 32 / 4;
|
|
1441
1527
|
img_desc_1d.buffer = extra->d;
|
|
1442
1528
|
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
|
1443
1529
|
CL_CHECK(err);
|
|
1444
1530
|
|
|
1445
|
-
img_fmt_1d = { CL_RGBA,
|
|
1531
|
+
img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
|
|
1446
1532
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
1447
1533
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
1448
|
-
img_desc_1d.image_width = M * K / 32 / 4
|
|
1534
|
+
img_desc_1d.image_width = M * K / 32 / 4;
|
|
1449
1535
|
img_desc_1d.buffer = dT_d;
|
|
1450
1536
|
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
|
|
1451
1537
|
CL_CHECK(err);
|
|
@@ -1454,8 +1540,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1454
1540
|
// set up and call the transpose kernels
|
|
1455
1541
|
// <----------------------------------------------------------------------------------> //
|
|
1456
1542
|
// weights
|
|
1457
|
-
int height_q = M /
|
|
1458
|
-
int width_q = K /
|
|
1543
|
+
int height_q = M / 4;
|
|
1544
|
+
int width_q = K / 4 / 4;
|
|
1459
1545
|
kernel = backend_ctx->kernel_transpose_16;
|
|
1460
1546
|
|
|
1461
1547
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
|
|
@@ -1469,8 +1555,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
1469
1555
|
CL_CHECK(clWaitForEvents(1, &evt));
|
|
1470
1556
|
|
|
1471
1557
|
// scales
|
|
1472
|
-
int height_s = M /
|
|
1473
|
-
int width_s = K / 32 /
|
|
1558
|
+
int height_s = M / 4;
|
|
1559
|
+
int width_s = K / 32 / 4;
|
|
1474
1560
|
|
|
1475
1561
|
kernel = backend_ctx->kernel_transpose_16;
|
|
1476
1562
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
|
|
@@ -1864,7 +1950,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
1864
1950
|
void * buf_d;
|
|
1865
1951
|
#endif
|
|
1866
1952
|
|
|
1867
|
-
#ifdef GGML_USE_OPENCL
|
|
1868
1953
|
// Make sure everything is done.
|
|
1869
1954
|
CL_CHECK(clFinish(queue));
|
|
1870
1955
|
|
|
@@ -1900,7 +1985,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
1900
1985
|
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
|
|
1901
1986
|
CL_CHECK(clFinish(queue));
|
|
1902
1987
|
#endif // GGML_OPENCL_SOA_Q
|
|
1903
|
-
#endif // GGML_USE_OPENCL
|
|
1904
1988
|
|
|
1905
1989
|
// Open file and dump.
|
|
1906
1990
|
char fname[512];
|
|
@@ -2580,26 +2664,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
2580
2664
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
2581
2665
|
|
|
2582
2666
|
const int ne00 = src0 ? src0->ne[0] : 0;
|
|
2583
|
-
const
|
|
2667
|
+
const int ne01 = src0 ? src0->ne[1] : 0;
|
|
2668
|
+
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
2669
|
+
const int ne03 = src0 ? src0->ne[3] : 0;
|
|
2584
2670
|
|
|
2585
|
-
|
|
2671
|
+
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
|
2672
|
+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
|
2673
|
+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
|
2586
2674
|
|
|
2587
2675
|
const int nth = MIN(64, ne00);
|
|
2588
2676
|
|
|
2589
2677
|
cl_kernel kernel = backend_ctx->kernel_norm;
|
|
2590
2678
|
|
|
2591
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2592
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2593
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2594
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2595
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2596
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2597
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2598
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2599
|
-
|
|
2600
|
-
|
|
2679
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
2680
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
2681
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
2682
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
|
2683
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
|
2684
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
|
2685
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
|
2686
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
|
2687
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
|
2688
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
|
2689
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
|
2690
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
|
2691
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));
|
|
2601
2692
|
|
|
2602
|
-
size_t global_work_size[] = {(size_t)
|
|
2693
|
+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
2603
2694
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
2604
2695
|
|
|
2605
2696
|
#ifdef GGML_OPENCL_PROFILING
|
|
@@ -2637,16 +2728,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
2637
2728
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
2638
2729
|
|
|
2639
2730
|
const int ne00 = src0 ? src0->ne[0] : 0;
|
|
2731
|
+
const int ne01 = src0 ? src0->ne[1] : 0;
|
|
2732
|
+
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
2733
|
+
const int ne03 = src0 ? src0->ne[3] : 0;
|
|
2734
|
+
|
|
2640
2735
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
|
2736
|
+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
|
2737
|
+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
|
2641
2738
|
|
|
2642
2739
|
GGML_ASSERT(ne00 % 4 == 0);
|
|
2643
|
-
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
|
2644
2740
|
|
|
2645
2741
|
const int nth = MIN(64, ne00);
|
|
2646
2742
|
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
|
|
2743
|
+
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
2650
2744
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|
|
2651
2745
|
|
|
2652
2746
|
cl_kernel kernel = backend_ctx->kernel_rms_norm;
|
|
@@ -2661,15 +2755,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
2661
2755
|
sizeof(local_work_size), local_work_size,
|
|
2662
2756
|
sizeof(size_t), &sgs, NULL));
|
|
2663
2757
|
|
|
2664
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2665
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2666
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2667
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2668
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2669
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2670
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2758
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
2759
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
2760
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
|
|
2761
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
|
|
2762
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
|
|
2763
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
|
|
2764
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
|
|
2765
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
|
|
2766
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
|
2767
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
|
2768
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
|
2769
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
|
|
2671
2770
|
// This is local memory - the size depends on subgroup size.
|
|
2672
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
2771
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));
|
|
2673
2772
|
|
|
2674
2773
|
#ifdef GGML_OPENCL_PROFILING
|
|
2675
2774
|
cl_event evt;
|
|
@@ -2865,6 +2964,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
2865
2964
|
CL_CHECK(status);
|
|
2866
2965
|
|
|
2867
2966
|
int height_B = N/4;
|
|
2967
|
+
if (height_B == 0) {
|
|
2968
|
+
height_B = 1;
|
|
2969
|
+
}
|
|
2868
2970
|
int width_B = K/4;
|
|
2869
2971
|
int padded_height_B = (N + padding)/4;
|
|
2870
2972
|
|
|
@@ -3013,11 +3115,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
3013
3115
|
}
|
|
3014
3116
|
|
|
3015
3117
|
if (N == 1) {
|
|
3016
|
-
|
|
3118
|
+
size_t wavesize = backend_ctx->adreno_wave_size;
|
|
3119
|
+
local_work_size[0] = wavesize; // localsize
|
|
3017
3120
|
local_work_size[1] = 4; // reduce factor
|
|
3018
3121
|
local_work_size[2] = 1;
|
|
3019
3122
|
|
|
3020
|
-
global_work_size[0] = M / 2;
|
|
3123
|
+
global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
|
|
3021
3124
|
global_work_size[1] = 4; // reduce factor
|
|
3022
3125
|
global_work_size[2] = 1;
|
|
3023
3126
|
}
|
|
@@ -3026,6 +3129,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
3026
3129
|
// enqueue kernel with profiling
|
|
3027
3130
|
// <--------------------------------------------> //
|
|
3028
3131
|
#ifdef GGML_OPENCL_PROFILING
|
|
3132
|
+
cl_event evt;
|
|
3029
3133
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3030
3134
|
|
|
3031
3135
|
g_profiling_info.emplace_back();
|
|
@@ -3767,10 +3871,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3767
3871
|
const int ne02 = src0 ? src0->ne[2] : 0;
|
|
3768
3872
|
const int ne03 = src0 ? src0->ne[3] : 0;
|
|
3769
3873
|
|
|
3770
|
-
const
|
|
3771
|
-
const
|
|
3772
|
-
const
|
|
3773
|
-
const
|
|
3874
|
+
const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
|
|
3875
|
+
const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
|
|
3876
|
+
const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
|
|
3877
|
+
const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
|
3774
3878
|
|
|
3775
3879
|
const int ne10 = src1 ? src1->ne[0] : 0;
|
|
3776
3880
|
const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
|
|
@@ -3782,10 +3886,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
3782
3886
|
const int ne2 = dst ? dst->ne[2] : 0;
|
|
3783
3887
|
const int ne3 = dst ? dst->ne[3] : 0;
|
|
3784
3888
|
|
|
3785
|
-
const
|
|
3786
|
-
const
|
|
3787
|
-
const
|
|
3788
|
-
const
|
|
3889
|
+
const cl_ulong nb0 = dst ? dst->nb[0] : 0;
|
|
3890
|
+
const cl_ulong nb1 = dst ? dst->nb[1] : 0;
|
|
3891
|
+
const cl_ulong nb2 = dst ? dst->nb[2] : 0;
|
|
3892
|
+
const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
3789
3893
|
|
|
3790
3894
|
GGML_ASSERT(ne10 % ne02 == 0);
|
|
3791
3895
|
GGML_ASSERT(ne10 >= ne02);
|