@fugood/llama.node 0.3.13 → 0.3.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +89 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/CMakeLists.txt +9 -1
  25. package/src/llama.cpp/cmake/common.cmake +2 -0
  26. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  27. package/src/llama.cpp/common/arg.cpp +132 -13
  28. package/src/llama.cpp/common/chat.cpp +960 -266
  29. package/src/llama.cpp/common/chat.h +135 -0
  30. package/src/llama.cpp/common/common.cpp +33 -174
  31. package/src/llama.cpp/common/common.h +27 -67
  32. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  33. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  34. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  35. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  36. package/src/llama.cpp/common/sampling.cpp +45 -7
  37. package/src/llama.cpp/common/speculative.cpp +10 -9
  38. package/src/llama.cpp/common/speculative.h +1 -1
  39. package/src/llama.cpp/docs/build.md +45 -7
  40. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +2 -2
  41. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +4 -2
  42. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -1
  43. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  44. package/src/llama.cpp/examples/gritlm/gritlm.cpp +2 -2
  45. package/src/llama.cpp/examples/imatrix/imatrix.cpp +3 -4
  46. package/src/llama.cpp/examples/infill/infill.cpp +2 -2
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
  48. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +5 -5
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  50. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  51. package/src/llama.cpp/examples/llava/clip.h +19 -3
  52. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  53. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  54. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  55. package/src/llama.cpp/examples/lookahead/lookahead.cpp +7 -6
  56. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  57. package/src/llama.cpp/examples/main/main.cpp +79 -34
  58. package/src/llama.cpp/examples/parallel/parallel.cpp +6 -5
  59. package/src/llama.cpp/examples/passkey/passkey.cpp +15 -14
  60. package/src/llama.cpp/examples/perplexity/perplexity.cpp +6 -6
  61. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  62. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -2
  63. package/src/llama.cpp/examples/retrieval/retrieval.cpp +1 -1
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +196 -108
  67. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +2 -2
  68. package/src/llama.cpp/examples/server/server.cpp +113 -101
  69. package/src/llama.cpp/examples/server/utils.hpp +94 -105
  70. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +14 -14
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  74. package/src/llama.cpp/examples/tts/tts.cpp +263 -151
  75. package/src/llama.cpp/ggml/CMakeLists.txt +14 -1
  76. package/src/llama.cpp/ggml/cmake/common.cmake +26 -0
  77. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  79. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  80. package/src/llama.cpp/ggml/include/ggml.h +29 -1
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -34
  82. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  83. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  84. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  85. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +6 -2
  87. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -7
  88. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  89. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +139 -16
  90. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  91. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1546 -387
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1645 -113
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  102. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +2 -1
  103. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -1
  104. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  105. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  106. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  107. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +242 -0
  108. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -6
  109. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  110. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -138
  111. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  112. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  113. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +5 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +117 -36
  117. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  118. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +147 -16
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +40 -40
  124. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +307 -0
  125. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +262 -746
  127. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -1
  128. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -78
  129. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +114 -6
  130. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +6 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +4 -1
  132. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  134. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +305 -0
  135. package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +10 -0
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +498 -188
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -4
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +16 -3
  139. package/src/llama.cpp/ggml/src/ggml.c +93 -5
  140. package/src/llama.cpp/include/llama.h +105 -27
  141. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  142. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  143. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  144. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  145. package/src/llama.cpp/requirements.txt +1 -0
  146. package/src/llama.cpp/src/CMakeLists.txt +5 -2
  147. package/src/llama.cpp/src/llama-adapter.cpp +19 -20
  148. package/src/llama.cpp/src/llama-adapter.h +11 -9
  149. package/src/llama.cpp/src/llama-arch.cpp +123 -16
  150. package/src/llama.cpp/src/llama-arch.h +19 -0
  151. package/src/llama.cpp/src/llama-batch.h +2 -2
  152. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  153. package/src/llama.cpp/src/llama-context.cpp +2253 -1222
  154. package/src/llama.cpp/src/llama-context.h +214 -77
  155. package/src/llama.cpp/src/llama-cparams.h +1 -0
  156. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  157. package/src/llama.cpp/src/llama-grammar.h +12 -3
  158. package/src/llama.cpp/src/llama-graph.cpp +1662 -0
  159. package/src/llama.cpp/src/llama-graph.h +574 -0
  160. package/src/llama.cpp/src/llama-hparams.cpp +8 -0
  161. package/src/llama.cpp/src/llama-hparams.h +9 -0
  162. package/src/llama.cpp/src/llama-io.cpp +15 -0
  163. package/src/llama.cpp/src/llama-io.h +35 -0
  164. package/src/llama.cpp/src/llama-kv-cache.cpp +1006 -291
  165. package/src/llama.cpp/src/llama-kv-cache.h +178 -109
  166. package/src/llama.cpp/src/llama-memory.cpp +1 -0
  167. package/src/llama.cpp/src/llama-memory.h +21 -0
  168. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  169. package/src/llama.cpp/src/llama-model.cpp +8230 -122
  170. package/src/llama.cpp/src/llama-model.h +34 -1
  171. package/src/llama.cpp/src/llama-quant.cpp +10 -1
  172. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  173. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  174. package/src/llama.cpp/src/llama.cpp +51 -9837
  175. package/src/llama.cpp/tests/test-backend-ops.cpp +247 -112
  176. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  177. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  178. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  179. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  180. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  181. package/src/llama.cpp/common/chat.hpp +0 -55
  182. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +0 -143
  183. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +0 -9
  184. /package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1,4 +1,4 @@
- #define CL_TARGET_OPENCL_VERSION 220
+ #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
  #define CL_USE_DEPRECATED_OPENCL_1_2_APIS

  // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
  #include <vector>
  #include <string>
  #include <cmath>
+ #include <memory>
+ #include <charconv>

  #undef MIN
  #undef MAX
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
      X1E,
  };

+ struct ggml_cl_version {
+     cl_uint major = 0;
+     cl_uint minor = 0;
+ };
+
+ // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version parse_cl_version(std::string_view str) {
+     size_t major_str_begin = 0;
+     size_t major_str_end   = str.find(".", major_str_begin);
+     if (major_str_end == std::string::npos) {
+         return {};
+     }
+
+     size_t minor_str_begin = major_str_end + 1;
+     size_t minor_str_end   = str.find(" ", minor_str_begin);
+     if (minor_str_end == std::string::npos) {
+         return {};
+     }
+
+     cl_uint version_major;
+     if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+         return {};
+     }
+
+     cl_uint version_minor;
+     if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+         return {};
+     }
+     return { version_major, version_minor };
+ }
+
+ // Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+     size_t param_size;
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+     auto param_value = std::string_view(param_storage.get(), param_size);
+     const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+     return parse_cl_version(param_value);
+ }
+
+ // Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+     size_t param_size;
+
+ #if CL_TARGET_OPENCL_VERSION >= 300
+     if (platform_version.major >= 3) {
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+         if (!param_size) {
+             return {};
+         }
+
+         std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+         unsigned versions_count = param_size / sizeof(cl_name_version);
+
+         cl_version version_max = 0;
+         for (unsigned i = 0; i < versions_count; i++) {
+             version_max = std::max<cl_version>(versions[i].version, version_max);
+         }
+
+         return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+     }
+ #else
+     GGML_UNUSED(platform_version);
+ #endif // CL_TARGET_OPENCL_VERSION >= 300
+
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+     if (!param_size) {
+         return {};
+     }
+
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+     auto param_value = std::string_view(param_storage.get(), param_size);
+
+     const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+
+     return parse_cl_version(param_value);
+ }
+
  static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
      if (strstr(device_name, "730") ||
          strstr(device_name, "740") ||
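The two getters above feed strings like "OpenCL 3.0 <vendor info>" (CL_PLATFORM_VERSION) and "OpenCL C 3.0 <vendor info>" (CL_DEVICE_OPENCL_C_VERSION) through parse_cl_version after stripping the prefix. A minimal standalone sketch of the same parsing logic, assuming a plain unsigned in place of cl_uint and a hypothetical cl_ver struct so it builds without the OpenCL headers:

    #include <charconv>
    #include <cstdio>
    #include <string_view>

    struct cl_ver { unsigned major = 0, minor = 0; };

    static cl_ver parse(std::string_view s) {
        size_t dot = s.find('.');                       // major ends at the dot
        if (dot == std::string_view::npos) return {};
        size_t end = s.find(' ', dot + 1);              // minor ends at the space
        if (end == std::string_view::npos) return {};
        cl_ver v;
        if (std::from_chars(s.data(), s.data() + dot, v.major).ec != std::errc{}) return {};
        if (std::from_chars(s.data() + dot + 1, s.data() + end, v.minor).ec != std::errc{}) return {};
        return v;
    }

    int main() {
        // The backend strips the "OpenCL " prefix before parsing; substr(7) mimics that.
        cl_ver v = parse(std::string_view("OpenCL 3.0 CUDA 12.2.0").substr(7));
        std::printf("parsed %u.%u\n", v.major, v.minor); // parsed 3.0
    }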
@@ -204,8 +297,27 @@ static int ggml_backend_opencl_n_devices = 0;
  struct ProfilingInfo {
      std::string op_name;
      std::string kernel_name;
-     // Kernel execution time in nanoseconds.
-     cl_ulong duration_ns;
+
+     cl_kernel kernel;
+     cl_event evt;
+
+     cl_ulong cmd_queued;
+     cl_ulong cmd_submit;
+     cl_ulong cmd_start;
+     cl_ulong cmd_end;
+     cl_ulong overhead_start;
+     cl_ulong overhead_end;
+     // For the times below, see the spec for clGetEventProfilingInfo
+     // The time the kernel spent in the command queue - SUBMIT - QUEUED
+     cl_ulong cmd_queued_duration_ns;
+     // The time the kernel spent for submission - START - SUBMIT
+     cl_ulong cmd_submit_duration_ns;
+     // Kernel execution time in nanoseconds - END - START
+     cl_ulong cmd_duration_ns;
+     // The time for the kernel to complete - COMPLETE - END
+     cl_ulong cmd_complete_duration_ns;
+     // Total time to finish the kernel - COMPLETE - QUEUED
+     cl_ulong cmd_total_duration_ns;
      // Global and local work sizes.
      size_t global_size[3];
      size_t local_size[3];
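To make the derived fields concrete, take a hypothetical event timeline of QUEUED = 100 ns, SUBMIT = 140 ns, START = 150 ns, END = 950 ns, COMPLETE = 960 ns: the struct then records cmd_queued_duration_ns = 40 (waiting in the queue), cmd_submit_duration_ns = 10 (submission), cmd_duration_ns = 800 (execution), cmd_complete_duration_ns = 10 (completion) and cmd_total_duration_ns = 860 (end to end).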
@@ -278,7 +390,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      cl_int err;

- #ifdef GGML_PROFILE_OPENCL
+ #ifdef GGML_OPENCL_PROFILING
      GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
  #endif

@@ -444,19 +556,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
          backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
          backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);

-         // Default wave size is 128, A8x uses 64.
-         if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-             backend_ctx->adreno_wave_size = 64;
-         } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                    backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-             backend_ctx->adreno_wave_size = 128;
-         } else {
-             backend_ctx->adreno_wave_size = 128;
-             GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                 "using wave size %d, "
-                 "may not work as expected\n",
-                 backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-         }
+         // Use wave size of 64 for all Adreno GPUs.
+         backend_ctx->adreno_wave_size = 64;
      } else if (strstr(default_device->name, "Intel")) {
          backend_ctx->gpu_family = GPU_FAMILY::INTEL;
      } else {
@@ -481,16 +582,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      // A local ref of cl_device_id for convenience
      cl_device_id device = backend_ctx->device;

+     ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+
      // Check device OpenCL version, OpenCL 2.0 or above is required
-     size_t device_ver_str_size;
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
-     char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-     device_ver_buffer[device_ver_str_size] = '\0';
-     GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
-
-     if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-         strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+     if (opencl_c_version.major < 2) {
          GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
          return backend_ctx;
      }
@@ -527,15 +623,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
      // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-     if (strstr(device_ver_buffer, "OpenCL 3") &&
-         strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+     if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
          strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
          GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
              "(note that subgroups is an optional feature in OpenCL 3.0)\n");
          return backend_ctx;
      }

-     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+     cl_uint base_align_in_bits;
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
+     GGML_ASSERT(base_align_in_bits % 8u == 0);
+     backend_ctx->alignment = base_align_in_bits / 8u;
      GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);

      clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
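This fixes a unit mismatch: per the OpenCL spec, CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, while the backend uses backend_ctx->alignment as a byte offset. For a device reporting, say, 1024 bits, the alignment now correctly becomes 1024 / 8 = 128 bytes instead of the previous 1024.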
@@ -589,9 +687,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      const std::string kernel_src = read_file("ggml-opencl.cl");
  #endif

-     std::string compile_opts =
-         "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
-         "-cl-finite-math-only -cl-fast-relaxed-math ";
+     auto opencl_c_std =
+         std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+     std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                " -cl-mad-enable -cl-unsafe-math-optimizations"
+                                " -cl-finite-math-only -cl-fast-relaxed-math";
      backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);

      // Non matmul kernels.
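As an illustration (device values assumed): on a device whose OpenCL C version resolves to 3.0, opencl_c_std is "CL3.0" and compile_opts becomes "-cl-std=CL3.0 -cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math", where the old code always pinned "-cl-std=CL2.0" regardless of the device.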
@@ -701,10 +802,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));

      // Gemv general
-     std::string CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                                        " -cl-mad-enable "
+                                        " -DSIMDGROUP_WIDTH=" +
+                                        std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -721,12 +822,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                            " -cl-mad-enable "
+                            " -DLINE_STRIDE_A=2048 "
+                            " -DBLOCK_STRIDE_A=16384 "
+                            " -DSIMDGROUP_WIDTH=" +
+                            std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -743,12 +844,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                            " -cl-mad-enable "
+                            " -DLINE_STRIDE_A=2048 "
+                            " -DBLOCK_STRIDE_A=16384 "
+                            " -DSIMDGROUP_WIDTH=" +
+                            std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -758,12 +859,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));

      // Gemv 5504, 44032
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=5504 "
-         " -DBLOCK_STRIDE_A=44032 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                            " -cl-mad-enable "
+                            " -DLINE_STRIDE_A=5504 "
+                            " -DBLOCK_STRIDE_A=44032 "
+                            " -DSIMDGROUP_WIDTH=" +
+                            std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -773,12 +874,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 16000, 128000
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=16000 "
-         " -DBLOCK_STRIDE_A=128000 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+                            " -cl-mad-enable "
+                            " -DLINE_STRIDE_A=16000 "
+                            " -DBLOCK_STRIDE_A=128000 "
+                            " -DSIMDGROUP_WIDTH=" +
+                            std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -821,12 +922,56 @@ static void ggml_cl2_free(void) {
          return;
      }

+     // Populate profiling info
+     for (ProfilingInfo & info : g_profiling_info) {
+         cl_ulong cmd_queued;
+         cl_ulong cmd_submit;
+         cl_ulong cmd_start;
+         cl_ulong cmd_end;
+         cl_ulong cmd_complete;
+
+         CL_CHECK(clWaitForEvents(1, &info.evt));
+         CL_CHECK(clGetEventProfilingInfo(
+             info.evt, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &cmd_queued, NULL));
+         CL_CHECK(clGetEventProfilingInfo(
+             info.evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &cmd_submit, NULL));
+         CL_CHECK(clGetEventProfilingInfo(
+             info.evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &cmd_start, NULL));
+         CL_CHECK(clGetEventProfilingInfo(
+             info.evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &cmd_end, NULL));
+         CL_CHECK(clGetEventProfilingInfo(
+             info.evt, CL_PROFILING_COMMAND_COMPLETE, sizeof(cl_ulong), &cmd_complete, NULL));
+         CL_CHECK(clReleaseEvent(info.evt));
+
+         char kernel_name[512];
+         CL_CHECK(clGetKernelInfo(info.kernel, CL_KERNEL_FUNCTION_NAME,
+             sizeof(kernel_name), kernel_name, NULL));
+         info.kernel_name = kernel_name;
+
+         info.cmd_queued = cmd_queued;
+         info.cmd_submit = cmd_submit;
+         info.cmd_start  = cmd_start;
+         info.cmd_end    = cmd_end;
+
+         info.cmd_queued_duration_ns   = cmd_submit   - cmd_queued;
+         info.cmd_submit_duration_ns   = cmd_start    - cmd_submit;
+         info.cmd_duration_ns          = cmd_end      - cmd_start;
+         info.cmd_complete_duration_ns = cmd_complete - cmd_end;
+         info.cmd_total_duration_ns    = cmd_complete - cmd_queued;
+     }
+
+     // Dump a csv
      float total_kernel_time = 0;
-     fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n");
+     fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration (ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
      for (const ProfilingInfo & info : g_profiling_info) {
-         total_kernel_time += info.duration_ns/1.e6f;
-         fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
-             info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f,
+         total_kernel_time += info.cmd_duration_ns/1.e6f;
+         fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
+             info.op_name.c_str(), info.kernel_name.c_str(),
+             info.cmd_queued_duration_ns/1.e6f,
+             info.cmd_submit_duration_ns/1.e6f,
+             info.cmd_duration_ns/1.e6f,
+             info.cmd_complete_duration_ns/1.e6f,
+             info.cmd_total_duration_ns/1.e6f,
              info.global_size[0], info.global_size[1], info.global_size[2],
              info.local_size[0], info.local_size[1], info.local_size[2],
              info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
@@ -834,6 +979,27 @@ static void ggml_cl2_free(void) {
      fclose(fperf);

      GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
+
+     // Dump a simple chrome trace
+     FILE* ftrace = fopen("cl_trace.json", "w");
+     if (!ftrace) {
+         GGML_LOG_ERROR("Failed to open cl_trace.json\n");
+         return;
+     }
+
+     fprintf(ftrace, "[\n");
+     for (const ProfilingInfo & info : g_profiling_info) {
+         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+             info.kernel_name.c_str(), info.cmd_queued/1000);
+         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Host\"},\n",
+             info.kernel_name.c_str(), info.cmd_submit/1000);
+
+         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+             info.kernel_name.c_str(), info.cmd_start/1000);
+         fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %lu, \"pid\": \"\", \"tid\": \"Device\"},\n",
+             info.kernel_name.c_str(), info.cmd_end/1000);
+     }
+     fclose(ftrace);
  #endif
  }

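For reference, cl_trace.json is a Chrome trace-event array: each kernel contributes one B/E pair on a "Host" track spanning QUEUED to SUBMIT and one on a "Device" track spanning START to END, with timestamps converted from nanoseconds to microseconds. With hypothetical kernel names and timestamps, a device-track pair emitted by the loop above looks like:

    {"name": "kernel_norm", "cat": "OpenCL", "ph": "B", "ts": 1234, "pid": "", "tid": "Device"},
    {"name": "kernel_norm", "cat": "OpenCL", "ph": "E", "ts": 1290, "pid": "", "tid": "Device"},

The array is opened with "[" but never closed, which the chrome://tracing loader tolerates.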
@@ -1015,17 +1181,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
          case GGML_OP_ADD:
          case GGML_OP_SCALE:
          case GGML_OP_MUL:
-             return true;
+             return op->src[0]->type == GGML_TYPE_F32;
          case GGML_OP_UNARY:
              switch (ggml_get_unary_op(op)) {
                  case GGML_UNARY_OP_GELU:
                  case GGML_UNARY_OP_SILU:
                  case GGML_UNARY_OP_RELU:
-                     return ggml_is_contiguous(op->src[0]);
+                     return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                  default:
                      return false;
              }
          case GGML_OP_CLAMP:
+             return op->src[0]->type == GGML_TYPE_F32;
          case GGML_OP_SOFT_MAX:
          case GGML_OP_NORM:
          case GGML_OP_RMS_NORM:
@@ -1209,20 +1376,17 @@ struct ggml_backend_opencl_buffer_context {
      std::string name;
  };

- static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
  }

  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-     return cl_ptr_base;
-
-     GGML_UNUSED(buffer);
+     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+     return (void *) (uintptr_t) backend_ctx->alignment;
  }

- static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;

      ggml_cl2_init(buffer->buft->device);
@@ -1252,7 +1416,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
          tensor->extra = view_extra;
      } else {
          {
-             size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+             size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);

              ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
              extra->offset = offset;
@@ -1262,6 +1426,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
              tensor->extra = extra;
          }
      }
+     return GGML_STATUS_SUCCESS;
  }

  // The optimized gemm and gemv kernels are used for large matrices without batch.
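The synthetic base pointer returned by ggml_backend_opencl_buffer_get_base now equals the device's byte alignment instead of the fixed 0x1000, so offset 0 is aligned by construction and any address the allocator hands out decodes back into a sub-buffer offset. A minimal sketch of that arithmetic, assuming a hypothetical alignment of 128 bytes and using integer math to stand in for the fake pointers:

    #include <cstdint>
    #include <cstdio>

    int main() {
        uintptr_t base = 128;                    // what get_base returns for alignment == 128
        uintptr_t data = base + 256;             // a tensor "address" produced by the allocator
        size_t offset = (size_t)(data - base);   // 256, later used as the device-buffer offset
        std::printf("offset = %zu\n", offset);
    }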
@@ -1376,6 +1541,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      int M = tensor->ne[1]; // ne01
      int K = tensor->ne[0]; // ne00

+     // For the matrix-vector multiplication kernel, we assume K is a multiple of 32
+     GGML_ASSERT(K % 32 == 0);
+     // For the transpose kernels, we assume K is a multiple of 4 (satisfied by the prior assert) and M is a multiple of 4
+     GGML_ASSERT(M % 4 == 0);
+
      // transpose is out of place, so we need to allocate transposed buffers
      // <----------------------------------------------------------------------------------> //
      // use sub_buffer of max buffer size instead
@@ -1416,36 +1586,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      cl_mem qT_d_image1D;
      cl_mem dT_d_image1D;

-     cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      cl_image_desc img_desc_1d;

      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = extra->q;
      q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = qT_d;
      qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = extra->d;
      d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = dT_d;
      dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);
@@ -1454,8 +1624,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      // set up and call the transpose kernels
      // <----------------------------------------------------------------------------------> //
      // weights
-     int height_q = M / 8;
-     int width_q = K / 8 / 4;
+     int height_q = M / 4;
+     int width_q = K / 4 / 4;
      kernel = backend_ctx->kernel_transpose_16;

      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1639,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      CL_CHECK(clWaitForEvents(1, &evt));

      // scales
-     int height_s = M / 8;
-     int width_s = K / 32 / 8;
+     int height_s = M / 4;
+     int width_s = K / 32 / 4;

      kernel = backend_ctx->kernel_transpose_16;
      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1864,7 +2034,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
      void * buf_d;
  #endif

- #ifdef GGML_USE_OPENCL
      // Make sure everything is done.
      CL_CHECK(clFinish(queue));

@@ -1900,7 +2069,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
          extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
      CL_CHECK(clFinish(queue));
  #endif // GGML_OPENCL_SOA_Q
- #endif // GGML_USE_OPENCL

      // Open file and dump.
      char fname[512];
@@ -1978,25 +2146,14 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
  // Profiling utility
  //------------------------------------------------------------------------------
  #ifdef GGML_OPENCL_PROFILING
- void populateProfilingInfo(
+ static void populateProfilingInfo(
          ProfilingInfo& info, cl_event evt, cl_kernel kernel,
          size_t global_size[3], size_t local_size[3],
          const ggml_tensor * tensor) {
-     cl_ulong start;
-     cl_ulong end;
-     CL_CHECK(clWaitForEvents(1, &evt));
-     CL_CHECK(clGetEventProfilingInfo(
-         evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL));
-     CL_CHECK(clGetEventProfilingInfo(
-         evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL));
-
-     char kernel_name[512];
-     CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
-         sizeof(kernel_name), kernel_name, NULL));
-
-     info.duration_ns = end - start;
-     info.op_name = tensor->name;
-     info.kernel_name = kernel_name;
+     info.op_name = tensor->name;
+     info.kernel  = kernel;
+     info.evt     = evt;
+
      info.local_size[0] = local_size[0];
      info.local_size[1] = local_size[1];
      info.local_size[2] = local_size[2];
@@ -2580,26 +2737,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
-     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;

-     GGML_ASSERT(ggml_is_contiguous_1(src0));
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int nth = MIN(64, ne00);

      cl_kernel kernel = backend_ctx->kernel_norm;

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
-
-     const int64_t nrows = ggml_nrows(src0);
+     CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+     CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+     CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+     CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+     CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &eps));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));

-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

  #ifdef GGML_OPENCL_PROFILING
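The kernel is now launched over a 3-D NDRange instead of a flattened row count, which is what lets the ggml_is_contiguous_1 assert go away. As a worked example with a hypothetical src0 of ne01 = 32, ne02 = 8, ne03 = 1 and nth = 64: the old launch was {nrows*nth, 1, 1} = {16384, 1, 1} (nrows = 32*8*1 = 256), while the new one is {ne01*nth, ne02, ne03} = {2048, 8, 1}, so each row in each plane still gets its own 64-thread workgroup but the per-plane strides nb02/nb03 may now differ from a packed layout.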
@@ -2637,16 +2801,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;
+
      const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      GGML_ASSERT(ne00 % 4 == 0);
-     GGML_ASSERT(ggml_is_contiguous_1(src0));

      const int nth = MIN(64, ne00);

-     const int64_t nrows = ggml_nrows(src0);
-
-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

      cl_kernel kernel = backend_ctx->kernel_rms_norm;
@@ -2661,15 +2828,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
          sizeof(local_work_size), local_work_size,
          sizeof(size_t), &sgs, NULL));

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem),   &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem),   &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel,  4, sizeof(int),      &ne00));
+     CL_CHECK(clSetKernelArg(kernel,  5, sizeof(int),      &ne01));
+     CL_CHECK(clSetKernelArg(kernel,  6, sizeof(int),      &ne02));
+     CL_CHECK(clSetKernelArg(kernel,  7, sizeof(int),      &ne03));
+     CL_CHECK(clSetKernelArg(kernel,  8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel,  9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float),    &eps));
      // This is local memory - the size depends on subgroup size.
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));

  #ifdef GGML_OPENCL_PROFILING
      cl_event evt;
@@ -2865,6 +3037,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      CL_CHECK(status);

      int height_B = N/4;
+     if (height_B == 0) {
+         height_B = 1;
+     }
      int width_B = K/4;
      int padded_height_B = (N + padding)/4;

@@ -3013,11 +3188,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      }

      if (N == 1) {
-         local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+         size_t wavesize = backend_ctx->adreno_wave_size;
+         local_work_size[0] = wavesize; // localsize
          local_work_size[1] = 4; // reduce factor
          local_work_size[2] = 1;

-         global_work_size[0] = M / 2;
+         global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
          global_work_size[1] = 4; // reduce factor
          global_work_size[2] = 1;
      }
@@ -3026,6 +3202,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      // enqueue kernel with profiling
      // <--------------------------------------------> //
  #ifdef GGML_OPENCL_PROFILING
+     cl_event evt;
      CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

      g_profiling_info.emplace_back();
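The new expression rounds M / 2 up to a multiple of the wave size, since the first global dimension must be divisible by the first local dimension. For a hypothetical M = 1000 and wavesize = 64: M / 2 = 500, and ((500 + 63) / 64) * 64 = 512, whereas the old code would have requested an invalid 500-item global dimension against a 64-item workgroup.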
@@ -3767,10 +3944,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne02 = src0 ? src0->ne[2] : 0;
      const int ne03 = src0 ? src0->ne[3] : 0;

-     const int nb00 = src0 ? src0->nb[0] : 0;
-     const int nb01 = src0 ? src0->nb[1] : 0;
-     const int nb02 = src0 ? src0->nb[2] : 0;
-     const int nb03 = src0 ? src0->nb[3] : 0;
+     const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int ne10 = src1 ? src1->ne[0] : 0;
      const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3782,10 +3959,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne2 = dst ? dst->ne[2] : 0;
      const int ne3 = dst ? dst->ne[3] : 0;

-     const int nb0 = dst ? dst->nb[0] : 0;
-     const int nb1 = dst ? dst->nb[1] : 0;
-     const int nb2 = dst ? dst->nb[2] : 0;
-     const int nb3 = dst ? dst->nb[3] : 0;
+     const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+     const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+     const cl_ulong nb3 = dst ? dst->nb[3] : 0;

      GGML_ASSERT(ne10 % ne02 == 0);
      GGML_ASSERT(ne10 >= ne02);