@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1,4 +1,4 @@
- #define CL_TARGET_OPENCL_VERSION 220
+ #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
  #define CL_USE_DEPRECATED_OPENCL_1_2_APIS

  // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
  #include <vector>
  #include <string>
  #include <cmath>
+ #include <memory>
+ #include <charconv>

  #undef MIN
  #undef MAX
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
      X1E,
  };

+ struct ggml_cl_version {
+     cl_uint major = 0;
+     cl_uint minor = 0;
+ };
+
+ // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version parse_cl_version(std::string_view str) {
+     size_t major_str_begin = 0;
+     size_t major_str_end = str.find(".", major_str_begin);
+     if (major_str_end == std::string::npos) {
+         return {};
+     }
+
+     size_t minor_str_begin = major_str_end + 1;
+     size_t minor_str_end = str.find(" ", minor_str_begin);
+     if (minor_str_end == std::string::npos) {
+         return {};
+     }
+
+     cl_uint version_major;
+     if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+         return {};
+     }
+
+     cl_uint version_minor;
+     if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+         return {};
+     }
+     return { version_major, version_minor };
+ }
+
+ // Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+     size_t param_size;
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+     auto param_value = std::string_view(param_storage.get(), param_size);
+     const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+     return parse_cl_version(param_value);
+ }
+
+ // Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+     size_t param_size;
+
+ #if CL_TARGET_OPENCL_VERSION >= 300
+     if (platform_version.major >= 3) {
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+         if (!param_size) {
+             return {};
+         }
+
+         std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+         unsigned versions_count = param_size / sizeof(cl_name_version);
+
+         cl_version version_max = 0;
+         for (unsigned i = 0; i < versions_count; i++) {
+             version_max = std::max<cl_version>(versions[i].version, version_max);
+         }
+
+         return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+     }
+ #else
+     GGML_UNUSED(platform_version);
+ #endif // CL_TARGET_OPENCL_VERSION >= 300
+
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+     if (!param_size) {
+         return {};
+     }
+
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+     auto param_value = std::string_view(param_storage.get(), param_size);
+
+     const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+
+     return parse_cl_version(param_value);
+ }
+
  static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
      if (strstr(device_name, "730") ||
          strstr(device_name, "740") ||
@@ -143,6 +236,7 @@ struct ggml_backend_opencl_context {
      cl_kernel kernel_rms_norm;
      cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8;
      cl_kernel kernel_soft_max, kernel_soft_max_4;
+     cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
      cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
      cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
      cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
@@ -277,7 +371,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      cl_int err;

- #ifdef GGML_PROFILE_OPENCL
+ #ifdef GGML_OPENCL_PROFILING
      GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
  #endif

@@ -443,19 +537,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
          backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
          backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);

-         // Default wave size is 128, A8x uses 64.
-         if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-             backend_ctx->adreno_wave_size = 64;
-         } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                    backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-             backend_ctx->adreno_wave_size = 128;
-         } else {
-             backend_ctx->adreno_wave_size = 128;
-             GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                 "using wave size %d, "
-                 "may not work as expected\n",
-                 backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-         }
+         // Use wave size of 64 for all Adreno GPUs.
+         backend_ctx->adreno_wave_size = 64;
      } else if (strstr(default_device->name, "Intel")) {
          backend_ctx->gpu_family = GPU_FAMILY::INTEL;
      } else {
@@ -480,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      // A local ref of cl_device_id for convenience
      cl_device_id device = backend_ctx->device;

+     ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+
      // Check device OpenCL version, OpenCL 2.0 or above is required
-     size_t device_ver_str_size;
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
-     char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-     device_ver_buffer[device_ver_str_size] = '\0';
-     GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
-
-     if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-         strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+     if (opencl_c_version.major < 2) {
          GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
          return backend_ctx;
      }
@@ -526,15 +604,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
      // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-     if (strstr(device_ver_buffer, "OpenCL 3") &&
-         strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+     if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
          strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
          GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
                         "(note that subgroups is an optional feature in OpenCL 3.0)\n");
          return backend_ctx;
      }

-     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+     cl_uint base_align_in_bits;
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
+     GGML_ASSERT(base_align_in_bits % 8u == 0);
+     backend_ctx->alignment = base_align_in_bits / 8u;
      GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);

      clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
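The divide-by-8 above fixes a unit mismatch: CL_DEVICE_MEM_BASE_ADDR_ALIGN is defined by the OpenCL spec in bits, while the backend's alignment field is consumed as a byte count (and later in this diff it also becomes the fake buffer base). A tiny sketch of the conversion, with an illustrative queried value:

    #include <cassert>
    #include <cstdio>

    int main() {
        // clGetDeviceInfo(..., CL_DEVICE_MEM_BASE_ADDR_ALIGN, ...) reports *bits*.
        unsigned base_align_in_bits = 1024;   // illustrative; queried from the device in real code
        assert(base_align_in_bits % 8u == 0); // sanity check, mirroring the GGML_ASSERT above
        unsigned alignment = base_align_in_bits / 8u;
        std::printf("mem base addr align: %u bytes\n", alignment); // 128 bytes
    }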
@@ -588,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      const std::string kernel_src = read_file("ggml-opencl.cl");
  #endif

-     std::string compile_opts =
-         "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
-         "-cl-finite-math-only -cl-fast-relaxed-math ";
+     auto opencl_c_std =
+         std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+     std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable -cl-unsafe-math-optimizations"
+         " -cl-finite-math-only -cl-fast-relaxed-math";
      backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);

      // Non matmul kernels.
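The -cl-std flag now follows the OpenCL C version reported by the device instead of a hard-coded CL2.0, which lets the same kernels compile as CL3.0 where available. A sketch of the resulting option string (the version numbers are illustrative):

    #include <cstdio>
    #include <string>

    int main() {
        // As if get_opencl_c_version() returned {3, 0}.
        unsigned major = 3, minor = 0;
        std::string opencl_c_std = std::string("CL") + std::to_string(major) + "." + std::to_string(minor);
        std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
            " -cl-mad-enable -cl-unsafe-math-optimizations"
            " -cl-finite-math-only -cl-fast-relaxed-math";
        std::printf("%s\n", compile_opts.c_str());
        // -cl-std=CL3.0 -cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math
    }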
@@ -614,6 +697,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err));
      CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err));
      CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err));
+     CL_CHECK((backend_ctx->kernel_soft_max_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_f16", &err), err));
+     CL_CHECK((backend_ctx->kernel_soft_max_4_f16 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4_f16", &err), err));
      CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err));
      CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err));
      CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err));
@@ -698,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));

      // Gemv general
-     std::string CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -718,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=2048 "
+         " -DBLOCK_STRIDE_A=16384 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -740,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=2048 "
+         " -DBLOCK_STRIDE_A=16384 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -755,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));

      // Gemv 5504, 44032
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=5504 "
-         " -DBLOCK_STRIDE_A=44032 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=5504 "
+         " -DBLOCK_STRIDE_A=44032 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -770,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 16000, 128000
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=16000 "
-         " -DBLOCK_STRIDE_A=128000 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=16000 "
+         " -DBLOCK_STRIDE_A=128000 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -1012,17 +1097,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
          case GGML_OP_ADD:
          case GGML_OP_SCALE:
          case GGML_OP_MUL:
-             return true;
+             return op->src[0]->type == GGML_TYPE_F32;
          case GGML_OP_UNARY:
              switch (ggml_get_unary_op(op)) {
                  case GGML_UNARY_OP_GELU:
                  case GGML_UNARY_OP_SILU:
                  case GGML_UNARY_OP_RELU:
-                     return ggml_is_contiguous(op->src[0]);
+                     return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
                  default:
                      return false;
              }
          case GGML_OP_CLAMP:
+             return op->src[0]->type == GGML_TYPE_F32;
          case GGML_OP_SOFT_MAX:
          case GGML_OP_NORM:
          case GGML_OP_RMS_NORM:
@@ -1044,8 +1130,16 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
              return true;
          case GGML_OP_DIAG_MASK_INF:
              return op->ne[3] == 1;
-         case GGML_OP_ROPE:
+         case GGML_OP_ROPE: {
+             const int mode = ((const int32_t *) op->op_params)[2];
+             if (mode & GGML_ROPE_TYPE_MROPE) {
+                 return false;
+             }
+             if (mode & GGML_ROPE_TYPE_VISION) {
+                 return false;
+             }
              return true;
+         }
          default:
              return false;
      }
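The new GGML_OP_ROPE case reads the RoPE mode bits from op_params and declines the multimodal variants so they fall back to another backend. A sketch of the gating; the flag values mirror ggml.h at the time of this diff and should be treated as assumptions of the sketch:

    #include <cstdio>

    // Flag values as defined in ggml.h around this release (an assumption here):
    constexpr int ROPE_TYPE_NEOX   = 2;
    constexpr int ROPE_TYPE_MROPE  = 8;
    constexpr int ROPE_TYPE_VISION = 24; // == MROPE | 16, so it also sets the MROPE bit

    static bool rope_supported_sketch(int mode) {
        if (mode & ROPE_TYPE_MROPE)  return false; // multi-section RoPE: not handled
        if (mode & ROPE_TYPE_VISION) return false; // vision RoPE: not handled
        return true;
    }

    int main() {
        std::printf("normal=%d neox=%d vision=%d\n",
                    rope_supported_sketch(0),                 // 1
                    rope_supported_sketch(ROPE_TYPE_NEOX),    // 1
                    rope_supported_sketch(ROPE_TYPE_VISION)); // 0
    }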
@@ -1198,20 +1292,17 @@ struct ggml_backend_opencl_buffer_context {
      std::string name;
  };

- static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
  }

  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-     return cl_ptr_base;
-
-     GGML_UNUSED(buffer);
+     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+     return (void *) (uintptr_t) backend_ctx->alignment;
  }

- static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;

      ggml_cl2_init(buffer->buft->device);
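Context for the get_base change: cl_mem handles are not host-addressable, so the backend returns a fake non-null base pointer and later reconstructs byte offsets from tensor->data (see the init_tensor hunk below). Using the device's base-address alignment instead of the fixed 0x1000 keeps that fake base, and every offset derived from it, aligned to what the device requires. A sketch of the pointer round trip, with plain host pointers standing in for tensor->data:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Fake base equal to the device's base-address alignment (illustrative value).
        const uintptr_t alignment = 128;
        void * base = (void *) alignment;

        // ggml's allocator hands out tensor->data as base + some aligned offset.
        char * tensor_data = (char *) base + 4096;

        // init_tensor recovers the byte offset into the underlying cl_mem:
        size_t offset = (size_t) (tensor_data - (char *) base);
        int base_aligned = ((uintptr_t) base % alignment) == 0;
        std::printf("offset=%zu, base aligned=%d\n", offset, base_aligned); // offset=4096, base aligned=1
    }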
@@ -1241,7 +1332,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
          tensor->extra = view_extra;
      } else {
          {
-             size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+             size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);

              ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
              extra->offset = offset;
@@ -1251,6 +1342,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
              tensor->extra = extra;
          }
      }
+     return GGML_STATUS_SUCCESS;
  }

  // The optimized gemm and gemv kernels are used for large matrices without batch.
@@ -1365,6 +1457,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      int M = tensor->ne[1]; // ne01
      int K = tensor->ne[0]; // ne00

+     // For matrix-vector multiplication kernel, we assume K is a multiple of 32
+     GGML_ASSERT(K % 32 == 0);
+     // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+     GGML_ASSERT(M % 4 == 0);
+
      // transpose is out of place, so we need to allocate transposed buffers
      // <----------------------------------------------------------------------------------> //
      // use sub_buffer of max buffer size instead
@@ -1405,36 +1502,36 @@
      cl_mem qT_d_image1D;
      cl_mem dT_d_image1D;

-     cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      cl_image_desc img_desc_1d;

      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = extra->q;
      q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = qT_d;
      qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = extra->d;
      d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = dT_d;
      dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);
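The width changes here track the texel format switch: a CL_RGBA/CL_HALF_FLOAT texel is 8 bytes, half the size of a CL_RGBA/CL_FLOAT texel, so covering the same Q4_0 buffers takes twice as many texels. A quick arithmetic check; the Q4_0 layout constants (4-bit quants plus one fp16 scale per 32 weights) are an assumption of the sketch:

    #include <cassert>
    #include <cstdio>

    int main() {
        const long long M = 4096, K = 4096; // illustrative weight shape

        // Q4_0: 4-bit quants -> M*K/2 bytes; one fp16 scale per 32 weights -> M*K/16 bytes.
        const long long q_bytes = M * K / 2;
        const long long d_bytes = M * K / 32 * 2;

        const long long texel_bytes = 4 * 2; // CL_RGBA x CL_HALF_FLOAT = four 16-bit channels

        assert(M * K / 4 / 4  * texel_bytes == q_bytes); // new quant image width covers the quants
        assert(M * K / 32 / 4 * texel_bytes == d_bytes); // new scale image width covers the scales
        std::printf("image widths match the Q4_0 buffer sizes\n");
    }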
@@ -1443,8 +1540,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      // set up and call the transpose kernels
      // <----------------------------------------------------------------------------------> //
      // weights
-     int height_q = M / 8;
-     int width_q = K / 8 / 4;
+     int height_q = M / 4;
+     int width_q = K / 4 / 4;
      kernel = backend_ctx->kernel_transpose_16;

      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1458,8 +1555,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      CL_CHECK(clWaitForEvents(1, &evt));

      // scales
-     int height_s = M / 8;
-     int width_s = K / 32 / 8;
+     int height_s = M / 4;
+     int width_s = K / 32 / 4;

      kernel = backend_ctx->kernel_transpose_16;
      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1853,7 +1950,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
      void * buf_d;
  #endif

- #ifdef GGML_USE_OPENCL
      // Make sure everything is done.
      CL_CHECK(clFinish(queue));

@@ -1889,7 +1985,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
                               extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
      CL_CHECK(clFinish(queue));
  #endif // GGML_OPENCL_SOA_Q
- #endif // GGML_USE_OPENCL

      // Open file and dump.
      char fname[512];
@@ -2569,26 +2664,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
-     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;

-     GGML_ASSERT(ggml_is_contiguous_1(src0));
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int nth = MIN(64, ne00);

      cl_kernel kernel = backend_ctx->kernel_norm;

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));

-     const int64_t nrows = ggml_nrows(src0);
-
-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

  #ifdef GGML_OPENCL_PROFILING
@@ -2626,16 +2728,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;
+
      const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      GGML_ASSERT(ne00 % 4 == 0);
-     GGML_ASSERT(ggml_is_contiguous_1(src0));

      const int nth = MIN(64, ne00);

-     const int64_t nrows = ggml_nrows(src0);
-
-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

      cl_kernel kernel = backend_ctx->kernel_rms_norm;
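Dropping the ggml_is_contiguous_1 assert goes hand in hand with the new launch geometry: instead of flattening all rows into dimension 0, the norm kernels now run over a 3D range and receive nb01/nb02/nb03, so each row is addressed through explicit byte strides. A sketch of the geometry with illustrative extents (the pointer arithmetic in the comment describes what the kernel is expected to do, not actual kernel code):

    #include <cstdio>

    int main() {
        // Illustrative src0 extents and the work-group size used by the norm kernels.
        const size_t ne01 = 3, ne02 = 2, ne03 = 2, nth = 64;

        size_t global_work_size[3] = { ne01 * nth, ne02, ne03 };
        size_t local_work_size [3] = { nth, 1, 1 };

        // One work-group per row: group (i01, i02, i03) reads its row at
        // src0 + i01*nb01 + i02*nb02 + i03*nb03, which is why the kernels now take
        // nb02/nb03 and no longer require ggml_is_contiguous_1(src0).
        std::printf("groups: %zu x %zu x %zu\n",
                    global_work_size[0] / local_work_size[0], // == ne01
                    global_work_size[1], global_work_size[2]);
    }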
@@ -2650,15 +2755,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
                                sizeof(local_work_size), local_work_size,
                                sizeof(size_t), &sgs, NULL));

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
      // This is local memory - the size depends on subgroup size.
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));

  #ifdef GGML_OPENCL_PROFILING
      cl_event evt;
@@ -2854,6 +2964,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      CL_CHECK(status);

      int height_B = N/4;
+     if (height_B == 0) {
+         height_B = 1;
+     }
      int width_B = K/4;
      int padded_height_B = (N + padding)/4;

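The clamp guards narrow activations: integer division makes N/4 zero for N < 4, which would have produced a zero-sized image. A one-liner showing the effect:

    #include <cstdio>

    int main() {
        for (int N : {1, 2, 3, 4, 8}) {
            int height_B = N / 4;
            if (height_B == 0) {
                height_B = 1; // the clamp added above
            }
            std::printf("N=%d -> height_B=%d\n", N, height_B); // N<4 now yields 1, not 0
        }
    }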
@@ -3002,11 +3115,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      }

      if (N == 1) {
-         local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+         size_t wavesize = backend_ctx->adreno_wave_size;
+         local_work_size[0] = wavesize; // localsize
          local_work_size[1] = 4; // reduce factor
          local_work_size[2] = 1;

-         global_work_size[0] = M / 2;
+         global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
          global_work_size[1] = 4; // reduce factor
          global_work_size[2] = 1;
      }
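The new global size expression is the standard round-up-to-multiple idiom: it pads M / 2 to a whole number of wave-sized work-groups so the global size stays divisible by the local size. A sketch:

    #include <cstdio>

    // Round x up to the next multiple of m: the idiom used for global_work_size[0] above.
    static size_t round_up(size_t x, size_t m) {
        return ((x + m - 1) / m) * m;
    }

    int main() {
        const size_t wavesize = 64; // adreno_wave_size after this diff
        for (size_t M : {100, 128, 130, 8192}) {
            std::printf("M=%zu -> global[0]=%zu\n", M, round_up(M / 2, wavesize));
        }
        // e.g. M=100 -> M/2=50 -> 64: the launch stays a whole number of waves.
    }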
@@ -3015,6 +3129,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      // enqueue kernel with profiling
      // <--------------------------------------------> //
  #ifdef GGML_OPENCL_PROFILING
+     cl_event evt;
      CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

      g_profiling_info.emplace_back();
@@ -3666,6 +3781,8 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
      const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
      const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

+     const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
+
      // Local size must be wave size. Each workgroup is a wave, working on a row,
      // where a row corresponds to leading dimension.
      int nth = MIN(32, ne00);
@@ -3683,9 +3800,17 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
      cl_kernel kernel;

      if (ne00%4 == 0) {
-         kernel = backend_ctx->kernel_soft_max_4;
+         if (use_f16) {
+             kernel = backend_ctx->kernel_soft_max_4_f16;
+         } else {
+             kernel = backend_ctx->kernel_soft_max_4;
+         }
      } else {
-         kernel = backend_ctx->kernel_soft_max;
+         if (use_f16) {
+             kernel = backend_ctx->kernel_soft_max_f16;
+         } else {
+             kernel = backend_ctx->kernel_soft_max;
+         }
      }

      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
@@ -3746,10 +3871,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne02 = src0 ? src0->ne[2] : 0;
      const int ne03 = src0 ? src0->ne[3] : 0;

-     const int nb00 = src0 ? src0->nb[0] : 0;
-     const int nb01 = src0 ? src0->nb[1] : 0;
-     const int nb02 = src0 ? src0->nb[2] : 0;
-     const int nb03 = src0 ? src0->nb[3] : 0;
+     const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int ne10 = src1 ? src1->ne[0] : 0;
      const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3761,12 +3886,13 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne2 = dst ? dst->ne[2] : 0;
      const int ne3 = dst ? dst->ne[3] : 0;

-     const int nb0 = dst ? dst->nb[0] : 0;
-     const int nb1 = dst ? dst->nb[1] : 0;
-     const int nb2 = dst ? dst->nb[2] : 0;
-     const int nb3 = dst ? dst->nb[3] : 0;
+     const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+     const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+     const cl_ulong nb3 = dst ? dst->nb[3] : 0;

-     GGML_ASSERT(ne10 == ne02);
+     GGML_ASSERT(ne10 % ne02 == 0);
+     GGML_ASSERT(ne10 >= ne02);

      int nth = MIN(64, ne00);