@fugood/llama.node 0.4.7 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/CMakeLists.txt +4 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/lib/binding.ts +66 -6
  11. package/lib/index.js +59 -17
  12. package/lib/index.ts +74 -23
  13. package/package.json +1 -1
  14. package/src/DecodeAudioTokenWorker.cpp +40 -0
  15. package/src/DecodeAudioTokenWorker.h +22 -0
  16. package/src/EmbeddingWorker.cpp +7 -5
  17. package/src/LlamaCompletionWorker.cpp +68 -54
  18. package/src/LlamaCompletionWorker.h +7 -8
  19. package/src/LlamaContext.cpp +551 -235
  20. package/src/LlamaContext.h +26 -4
  21. package/src/LoadSessionWorker.cpp +4 -2
  22. package/src/SaveSessionWorker.cpp +10 -6
  23. package/src/TokenizeWorker.cpp +23 -14
  24. package/src/TokenizeWorker.h +2 -2
  25. package/src/addons.cc +8 -11
  26. package/src/common.hpp +129 -126
  27. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  28. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  29. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  30. package/src/llama.cpp/common/arg.cpp +14 -13
  31. package/src/llama.cpp/common/common.cpp +4 -75
  32. package/src/llama.cpp/common/common.h +7 -12
  33. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  34. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  35. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  36. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  37. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  38. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  39. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  40. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  41. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  42. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  43. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  44. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  45. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  48. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  51. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  52. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  53. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  54. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  55. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  56. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  57. package/src/llama.cpp/include/llama.h +24 -124
  58. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  61. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  62. package/src/llama.cpp/src/llama-context.cpp +60 -110
  63. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  64. package/src/llama.cpp/src/llama-graph.h +49 -7
  65. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  66. package/src/llama.cpp/src/llama-hparams.h +34 -5
  67. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  68. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  69. package/src/llama.cpp/src/llama-memory.h +3 -2
  70. package/src/llama.cpp/src/llama-model.cpp +273 -94
  71. package/src/llama.cpp/src/llama-model.h +4 -1
  72. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  73. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  74. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  75. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  76. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  77. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  78. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  79. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  82. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  83. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  84. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  85. package/src/llama.cpp/tools/run/run.cpp +2 -2
  86. package/src/llama.cpp/tools/server/server.cpp +158 -47
  87. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  88. package/src/llama.cpp/tools/tts/tts.cpp +4 -2
  89. package/src/tts_utils.cpp +342 -0
  90. package/src/tts_utils.h +62 -0
  91. package/bin/win32/arm64/llama-node.node +0 -0
  92. package/bin/win32/arm64/node.lib +0 -0
  93. package/bin/win32/x64/llama-node.node +0 -0
  94. package/bin/win32/x64/node.lib +0 -0
  95. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  96. package/bin/win32-vulkan/arm64/node.lib +0 -0
  97. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  98. package/bin/win32-vulkan/x64/node.lib +0 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp

@@ -27,6 +27,7 @@
  #include <cmath>
  #include <memory>
  #include <charconv>
+ #include <mutex>
 
  #undef MIN
  #undef MAX
@@ -74,6 +75,7 @@ struct ggml_cl_version {
  cl_uint minor = 0;
  };
 
+
  struct ggml_cl_compiler_version {
  ADRENO_CL_COMPILER_TYPE type;
  int major = -1;
@@ -91,6 +93,14 @@ struct ggml_cl_compiler_version {
  }
  };
 
+ static size_t align_to(size_t value, size_t to_alignment) {
+ GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)");
+ GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two");
+
+ return ((value + to_alignment - 1) / to_alignment) * to_alignment;
+ }
+
+
  // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
  static ggml_cl_version parse_cl_version(std::string_view str) {
  size_t major_str_begin = 0;
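
Note: the new align_to() helper rounds a byte offset up to the next multiple of a power-of-two alignment. A minimal standalone sketch of the same rounding, using plain asserts instead of GGML_ASSERT (names here are illustrative, not from the package):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // Round `value` up to the next multiple of `alignment` (a power of two).
    static size_t align_up(size_t value, size_t alignment) {
        assert(alignment != 0 && (alignment & (alignment - 1)) == 0);
        return ((value + alignment - 1) / alignment) * alignment;
    }

    int main() {
        std::printf("%zu\n", align_up(100, 64)); // 128: a 100-byte offset rounded to a 64-byte boundary
        std::printf("%zu\n", align_up(128, 64)); // 128: already aligned, unchanged
        return 0;
    }
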
@@ -221,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive
  return { type, major, minor, patch };
  }
 
+ struct ggml_backend_opencl_context;
+
  // backend device context
  struct ggml_backend_opencl_device_context {
  cl_platform_id platform;
  std::string platform_name;
 
- cl_device_id device;
- std::string device_name;
+ cl_device_id device;
+ std::string device_name;
+ cl_device_type device_type;
+ std::string device_version;
+
+ // Initialized by ggml_cl2_init().
+ ggml_backend_opencl_context * backend_ctx = nullptr;
+
+ // Initialized by ggml_backend_opencl_device_get_buffer_type()
+ ggml_backend_buffer_type buffer_type;
+
+ cl_context context = nullptr;
  };
 
  // backend context
@@ -248,6 +270,8 @@ struct ggml_backend_opencl_context {
 
  int adreno_wave_size;
 
+ cl_bool non_uniform_workgroups;
+
  cl_context context;
  cl_command_queue queue;
 
@@ -344,15 +368,8 @@ struct ggml_backend_opencl_context {
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
  };
 
- static ggml_backend_device g_ggml_backend_opencl_device;
- static ggml_backend_opencl_device_context g_ggml_ctx_dev_main {
- /*.platform =*/ nullptr,
- /*.platform_nane =*/ "",
- /*.device =*/ nullptr,
- /*.device_name =*/ "",
- };
-
- static int ggml_backend_opencl_n_devices = 0;
+ // All registered devices with a default device in the front.
+ static std::vector<ggml_backend_device> g_ggml_backend_opencl_devices;
 
  // Profiling
  #ifdef GGML_OPENCL_PROFILING
@@ -1107,25 +1124,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
  GGML_LOG_CONT("\n");
  }
 
- static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
- static bool initialized = false;
- static ggml_backend_opencl_context *backend_ctx = nullptr;
+ // XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+ // XXX static bool initialized = false;
+ // XXX static ggml_backend_opencl_context *backend_ctx = nullptr;
 
- if (initialized) {
- return backend_ctx;
- }
+ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev);
 
- ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context;
- GGML_ASSERT(dev_ctx);
- GGML_ASSERT(dev_ctx->platform == nullptr);
- GGML_ASSERT(dev_ctx->device == nullptr);
- GGML_ASSERT(backend_ctx == nullptr);
-
- initialized = true;
- backend_ctx = new ggml_backend_opencl_context();
- backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+ namespace /* anonymous */ {
+ extern struct ggml_backend_device_i ggml_backend_opencl_device_i;
+ }
 
- cl_int err;
+ // Look for available and suitable devices.
+ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_reg * reg) {
+ std::vector<ggml_backend_device> found_devices;
 
  #ifdef GGML_OPENCL_PROFILING
  GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
@@ -1158,11 +1169,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  struct cl_device devices[NDEV];
  unsigned n_devices = 0;
  struct cl_device * default_device = NULL;
+ unsigned default_platform_number = 0;
 
  cl_platform_id platform_ids[NPLAT];
  if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
  GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
- return backend_ctx;
+ return found_devices;
  }
 
  for (unsigned i = 0; i < n_platforms; i++) {
@@ -1197,19 +1209,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  }
 
  if (default_device == NULL && p->default_device != NULL) {
- default_device = p->default_device;
+ default_device = p->default_device;
+ default_platform_number = i;
  }
  }
 
  if (n_devices == 0) {
  GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n");
- return backend_ctx;
+ return found_devices;
  }
 
- char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
- char * user_device_string = getenv("GGML_OPENCL_DEVICE");
- int user_platform_number = -1;
- int user_device_number = -1;
+ char * user_platform_string = getenv("GGML_OPENCL_PLATFORM");
+ char * user_device_string = getenv("GGML_OPENCL_DEVICE");
+ int user_platform_number = -1;
+ int user_device_number = -1;
+ cl_device * candidate_devices = nullptr;
+ unsigned n_candidate_devices = 0;
 
  unsigned n;
  if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) {
@@ -1224,12 +1239,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number);
  exit(1);
  }
- default_device = &platform->devices[user_device_number];
+ default_device = &platform->devices[user_device_number];
+ candidate_devices = platform->devices;
+ n_candidate_devices = platform->n_devices;
  } else {
-
- struct cl_device * selected_devices = devices;
- unsigned n_selected_devices = n_devices;
-
+ // Choose a platform by matching a substring.
  if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) {
  for (unsigned i = 0; i < n_platforms; i++) {
  struct cl_platform * p = &platforms[i];
@@ -1244,20 +1258,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  exit(1);
  }
  }
- if (user_platform_number != -1) {
- struct cl_platform * p = &platforms[user_platform_number];
- selected_devices = p->devices;
- n_selected_devices = p->n_devices;
- default_device = p->default_device;
- if (n_selected_devices == 0) {
- GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
- exit(1);
- }
+
+ int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number;
+ struct cl_platform * p = &platforms[platform_idx];
+ candidate_devices = p->devices;
+ n_candidate_devices = p->n_devices;
+ default_device = p->default_device;
+ if (n_candidate_devices == 0) {
+ GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name);
+ exit(1);
  }
 
  if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) {
- for (unsigned i = 0; i < n_selected_devices; i++) {
- struct cl_device * d = &selected_devices[i];
+ for (unsigned i = 0; i < n_candidate_devices; i++) {
+ struct cl_device * d = &candidate_devices[i];
  if (strstr(d->name, user_device_string) != NULL) {
  user_device_number = d->number;
  break;
@@ -1269,71 +1283,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  }
  }
  if (user_device_number != -1) {
- selected_devices = &devices[user_device_number];
- n_selected_devices = 1;
- default_device = &selected_devices[0];
+ candidate_devices = &devices[user_device_number];
+ n_candidate_devices = 1;
+ default_device = &candidate_devices[0];
  }
 
- GGML_ASSERT(n_selected_devices > 0);
+ GGML_ASSERT(n_candidate_devices > 0);
 
  if (default_device == NULL) {
- default_device = &selected_devices[0];
+ default_device = &candidate_devices[0];
  }
  }
 
- GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name);
- GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version);
- if (default_device->type != CL_DEVICE_TYPE_GPU) {
- GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name);
+ GGML_ASSERT(n_candidate_devices != 0 && candidate_devices);
+
+ // Put the default device in front.
+ for (unsigned i = 1; i < n_candidate_devices; i++) {
+ if (&candidate_devices[i] == default_device) {
+ std::swap(candidate_devices[0], candidate_devices[i]);
+ default_device = &candidate_devices[0];
+ break;
+ }
  }
 
- dev_ctx->platform = default_device->platform->id;
- dev_ctx->device = default_device->id;
- backend_ctx->device = default_device->id;
+ GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name);
+
+ std::vector<cl_device_id> device_ids;
+ for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+ device_ids.push_back(dev->id);
+ }
 
- if (strstr(default_device->name, "Adreno") ||
- strstr(default_device->name, "Qualcomm") ||
- strstr(default_device->version, "Adreno")) {
+ cl_int err;
+ cl_context shared_context;
+ cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 };
+
+ CL_CHECK(
+ (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err));
+
+ for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) {
+ GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version);
+
+ auto dev_ctx = std::unique_ptr<ggml_backend_opencl_device_context>(new ggml_backend_opencl_device_context{
+ /*.platform =*/dev->platform->id,
+ /*.platform_nane =*/dev->platform->name,
+ /*.device =*/dev->id,
+ /*.device_name =*/dev->name,
+ /*.device_type =*/dev->type,
+ /*.device_version =*/dev->version,
+ /*.backend_ctx =*/nullptr,
+ /*.buffer_type =*/{},
+ /*.context =*/shared_context,
+ });
+
+ found_devices.push_back(ggml_backend_device{
+ /* .iface = */ ggml_backend_opencl_device_i,
+ /* .reg = */ reg,
+ /* .context = */ dev_ctx.get(),
+ });
+
+ if (!ggml_cl2_init(&found_devices.back())) {
+ found_devices.pop_back();
+ GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n");
+ continue;
+ }
+
+ dev_ctx.release();
+ }
+
+ if (found_devices.size()) {
+ auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(found_devices.front().context);
+ GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(),
+ dev_ctx->device_version.c_str());
+
+ if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) {
+ GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n",
+ dev_ctx->device_name.c_str());
+ }
+ }
+
+ return found_devices;
+ }
+
+ // Initialize device if it is supported (returns nullptr if it is not).
+ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
+ GGML_ASSERT(dev);
+ GGML_ASSERT(dev->context);
+
+ ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context;
+ GGML_ASSERT(dev_ctx->platform);
+ GGML_ASSERT(dev_ctx->device);
+
+ if (dev_ctx->backend_ctx) {
+ return dev_ctx->backend_ctx;
+ }
+
+ auto backend_ctx = std::make_unique<ggml_backend_opencl_context>();
+ backend_ctx->device = dev_ctx->device;
+ backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
+
+ if (strstr(dev_ctx->device_name.c_str(), "Adreno") ||
+ strstr(dev_ctx->device_name.c_str(), "Qualcomm") ||
+ strstr(dev_ctx->device_version.c_str(), "Adreno")) {
  backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
  // Usually device version contains the detailed device name
- backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version);
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str());
  if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) {
- backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
+ backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str());
  }
 
  // Use wave size of 64 for all Adreno GPUs.
  backend_ctx->adreno_wave_size = 64;
- } else if (strstr(default_device->name, "Intel")) {
+ } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) {
  backend_ctx->gpu_family = GPU_FAMILY::INTEL;
  } else {
- GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name);
+ GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str());
  backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN;
- return backend_ctx;
+ return nullptr;
  }
 
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
  if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) {
  GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; "
  "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n");
- return backend_ctx;
+ return nullptr;
  }
  #endif
 
  // Populate backend device name
- dev_ctx->platform_name = default_device->platform->name;
- dev_ctx->device_name = default_device->name;
- backend_ctx->device_name = default_device->name;
+ backend_ctx->device_name = dev_ctx->device_name;
 
  // A local ref of cl_device_id for convenience
  cl_device_id device = backend_ctx->device;
 
- ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+ ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform);
 
  // Check device OpenCL version, OpenCL 2.0 or above is required
  ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
  if (opencl_c_version.major < 2) {
  GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
- return backend_ctx;
+ return nullptr;
  }
 
  // Check driver version
@@ -1364,7 +1452,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  // fp16 is required
  if (!backend_ctx->fp16_support) {
  GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n");
- return backend_ctx;
+ return nullptr;
  }
 
  // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
@@ -1373,7 +1461,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
  GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
  "(note that subgroups is an optional feature in OpenCL 3.0)\n");
- return backend_ctx;
+ return nullptr;
  }
 
  cl_uint base_align_in_bits;
@@ -1397,6 +1485,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n",
  svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
 
+ if (opencl_c_version.major >= 3) {
+ CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
+ &backend_ctx->non_uniform_workgroups, 0));
+ } else {
+ GGML_ASSERT(opencl_c_version.major == 2);
+ // Non-uniform workgroup sizes is mandatory feature in v2.x.
+ backend_ctx->non_uniform_workgroups = true;
+ }
+
  // Print out configurations
  #ifdef GGML_OPENCL_SOA_Q
  GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n");
@@ -1406,14 +1503,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n");
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
- cl_context_properties properties[] = {
- (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0
- };
-
- CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err));
+ cl_int err;
 
  // A local ref of cl_context for convenience
- cl_context context = backend_ctx->context;
+ cl_context context = backend_ctx->context = dev_ctx->context;
 
  //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err),
  // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err :
@@ -1426,7 +1519,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err));
 
  // Load kernels
- load_cl_kernels(backend_ctx, opencl_c_version);
+ load_cl_kernels(backend_ctx.get(), opencl_c_version);
 
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
  // Allocate intermediate buffers and images
@@ -1456,10 +1549,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
  CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err));
  #endif // GGML_OPENCL_USE_ADRENO_KERNELS
 
- // For now we support a single devices
- ggml_backend_opencl_n_devices = 1;
-
- return backend_ctx;
+ dev_ctx->backend_ctx = backend_ctx.release();
+ return dev_ctx->backend_ctx;
  }
 
  static void ggml_cl2_free(void) {
@@ -1664,10 +1755,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
  GGML_UNUSED(backend);
  }
 
+ // Syncronizes the 'backend_ctx's device with others so that commands
+ // enqueued to it won't start until commands in the other devices have
+ // completed.
+ static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
+ if (g_ggml_backend_opencl_devices.size() < 2)
+ return; // No other devices to synchronize with.
+
+ std::vector<cl_event> events;
+ events.reserve(g_ggml_backend_opencl_devices.size());
+
+ for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) {
+ auto * other_backend_ctx = ggml_cl2_init(&backend_dev);
+ if (backend_ctx != other_backend_ctx) {
+ cl_event ev;
+ CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev));
+ CL_CHECK(clFlush(other_backend_ctx->queue));
+ events.push_back(ev);
+ }
+ }
+
+ CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr));
+ for (auto ev : events) {
+ CL_CHECK(clReleaseEvent(ev));
+ }
+ }
+
+ static void sync_with_other_backends(ggml_backend_t backend) {
+ auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
+ sync_with_other_backends(backend_ctx);
+ }
+
  static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
 
+ // NOTE: this may oversynchronize by synchronizing with
+ // backends/devices which don't compute 'cgraph's
+ // dependencies.
+ sync_with_other_backends(backend);
+
  if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
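
Note: the new sync_with_other_backends() uses the standard OpenCL 1.2 marker/barrier pattern for ordering work across command queues: a marker event is enqueued (and flushed) on every other queue, and a barrier on the target queue waits on those events. A minimal sketch of the same pattern with pre-created queues sharing one cl_context; error checking is omitted and the helper name is illustrative, not from the package:

    #include <CL/cl.h>
    #include <vector>

    // Make commands later enqueued on `target` wait for everything already
    // enqueued on each queue in `others`.
    static void wait_for_queues(cl_command_queue target, const std::vector<cl_command_queue> & others) {
        std::vector<cl_event> events;
        events.reserve(others.size());
        for (cl_command_queue q : others) {
            cl_event ev;
            clEnqueueMarkerWithWaitList(q, 0, nullptr, &ev); // marks the current end of `q`
            clFlush(q);                                      // ensure the marked work is submitted
            events.push_back(ev);
        }
        // Later commands on `target` cannot start until all markers complete.
        clEnqueueBarrierWithWaitList(target, (cl_uint) events.size(), events.data(), nullptr);
        for (cl_event ev : events) {
            clReleaseEvent(ev);
        }
    }
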
@@ -2058,15 +2185,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
  // The original tensor memory is divided into scales and quants, i.e.,
  // we first store scales, then quants.
  // Create subbuffer for scales.
- region.origin = extra_orig->offset + tensor->view_offs + offset;
+ region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
  region.size = size_d;
  extra->d = clCreateSubBuffer(
  extra_orig->data_device, CL_MEM_READ_WRITE,
  CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
  CL_CHECK(err);
+ auto previous_origin = region.origin;
 
  // Create subbuffer for quants.
- region.origin = extra_orig->offset + tensor->view_offs + offset + size_d;
+ region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
  region.size = size_q;
  extra->q = clCreateSubBuffer(
  extra_orig->data_device, CL_MEM_READ_WRITE,
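
Note: clCreateSubBuffer rejects a region whose origin is not aligned to the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits), returning CL_MISALIGNED_SUB_BUFFER_OFFSET, which is presumably why the sub-buffer origins above are now rounded up with align_to(). A minimal sketch of the idea; the helper name and parameters are illustrative, not from the package:

    #include <CL/cl.h>
    #include <cstddef>

    // Create a read/write sub-buffer whose origin is rounded up to
    // `alignment_bytes`, e.g. CL_DEVICE_MEM_BASE_ADDR_ALIGN / 8.
    static cl_mem make_aligned_subbuffer(cl_mem parent, size_t origin, size_t size,
                                         size_t alignment_bytes, cl_int * err) {
        size_t aligned_origin = ((origin + alignment_bytes - 1) / alignment_bytes) * alignment_bytes;
        cl_buffer_region region = { aligned_origin, size };
        return clCreateSubBuffer(parent, CL_MEM_READ_WRITE,
                                 CL_BUFFER_CREATE_TYPE_REGION, &region, err);
    }
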
@@ -2271,8 +2399,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
  cl_context context = backend_ctx->context;
  cl_command_queue queue = backend_ctx->queue;
 
- // Make sure all previously submitted commands are finished.
- CL_CHECK(clFinish(queue));
+ // Make sure all previously submitted commands in other devices are finished.
+ sync_with_other_backends(backend_ctx);
 
  #ifdef GGML_OPENCL_SOA_Q
  // In end-to-end runs, get_tensor is usually used to get back the logits,
@@ -2376,13 +2504,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b
  }
 
  static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
- // FIXME: not thread safe, device may not be initialized yet
- static cl_uint alignment = -1;
- if (alignment == (cl_uint)-1) {
- ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
- alignment = backend_ctx->alignment;
- }
- return alignment;
+ ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device);
+ return backend_ctx->alignment;
  }
 
  static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
@@ -2409,16 +2532,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
  /* .is_host = */ NULL,
  };
 
- ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
- static ggml_backend_buffer_type buffer_type = {
- /* .iface = */ ggml_backend_opencl_buffer_type_interface,
- /* .device = */ &g_ggml_backend_opencl_device,
- /* .context = */ nullptr,
- };
-
- return &buffer_type;
- }
-
  //
  // backend device
  //
@@ -2476,9 +2589,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co
  }
 
  static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) {
- return ggml_backend_opencl_buffer_type();
+ auto * dev_ctx = static_cast<ggml_backend_opencl_device_context *>(dev->context);
 
- GGML_UNUSED(dev);
+ dev_ctx->buffer_type = ggml_backend_buffer_type{
+ /* .iface = */ ggml_backend_opencl_buffer_type_interface,
+ /* .device = */ dev,
+ /* .context = */ nullptr,
+ };
+
+ return &dev_ctx->buffer_type;
  }
 
  static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
@@ -2494,12 +2613,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const
  }
 
  static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
- return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name;
+ // Check 'dev' and 'buffer_type' are not objects belonging to this backend.
+ if (dev->iface.get_name != ggml_backend_opencl_device_get_name ||
+ buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) {
+ return false;
+ }
 
- GGML_UNUSED(dev);
+ // Check cl_context is the same. clEnqueue* commands may not use
+ // buffers from another cl_context.
+ ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev);
+ ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device);
+ return backend_ctx0->context == backend_ctx1->context;
  }
 
- static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
+ namespace /* anonymous */ {
+ struct ggml_backend_device_i ggml_backend_opencl_device_i = {
  /* .get_name = */ ggml_backend_opencl_device_get_name,
  /* .get_description = */ ggml_backend_opencl_device_get_description,
  /* .get_memory = */ ggml_backend_opencl_device_get_memory,
@@ -2516,6 +2644,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = {
  /* .event_free = */ NULL,
  /* .event_synchronize = */ NULL,
  };
+ }
 
  // Backend registry
 
@@ -2526,15 +2655,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) {
  }
 
  static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
- return ggml_backend_opencl_n_devices;
+ return g_ggml_backend_opencl_devices.size();
 
  GGML_UNUSED(reg);
  }
 
  static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
- GGML_ASSERT(index == 0);
+ GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
 
- return &g_ggml_backend_opencl_device;
+ return &g_ggml_backend_opencl_devices[index];
 
  GGML_UNUSED(reg);
  GGML_UNUSED(index);
@@ -2548,27 +2677,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = {
  };
 
  ggml_backend_reg_t ggml_backend_opencl_reg(void) {
- // TODO: make this thread-safe somehow?
+ static std::mutex mutex;
  static ggml_backend_reg reg;
  static bool initialized = false;
+ std::lock_guard<std::mutex> lock(mutex);
 
- if (!initialized) {
- reg = ggml_backend_reg {
- /* .api_version = */ GGML_BACKEND_API_VERSION,
- /* .iface = */ ggml_backend_opencl_reg_i,
- /* .context = */ NULL,
- };
-
- g_ggml_backend_opencl_device = ggml_backend_device {
- /* .iface = */ ggml_backend_opencl_device_i,
- /* .reg = */ &reg,
- /* .context = */ &g_ggml_ctx_dev_main,
- };
+ if (initialized) {
+ return &reg;
+ }
+ initialized = true;
 
- ggml_cl2_init(&g_ggml_backend_opencl_device);
+ g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(&reg);
 
- initialized = true;
- }
+ reg = ggml_backend_reg{
+ /* .api_version = */ GGML_BACKEND_API_VERSION,
+ /* .iface = */ ggml_backend_opencl_reg_i,
+ /* .context = */ NULL,
+ };
 
  return &reg;
  }
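
Note: the registry entry point is now guarded by a function-local std::mutex plus an `initialized` flag, replacing the earlier "TODO: make this thread-safe" comment. A minimal standalone sketch of the same guard pattern; the type and function names are illustrative, not from the package:

    #include <mutex>

    struct registry { int api_version = 0; };

    // The first caller performs the one-time setup under the lock; later
    // callers take the lock, see `initialized == true`, and return at once.
    static registry * get_registry() {
        static std::mutex mutex;
        static registry reg;
        static bool initialized = false;

        std::lock_guard<std::mutex> lock(mutex);
        if (initialized) {
            return &reg;
        }
        initialized = true;

        reg.api_version = 1; // stand-in for the real one-time setup
        return &reg;
    }

A function-local static with an initializing expression would give the same thread-safety guarantee since C++11, but the explicit mutex keeps the device probing and the flag update under a single lock.
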
@@ -2942,14 +3067,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  } else {
  unsigned int nth = MIN(64, ne0);
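
Note: when an explicit local work size is requested and the device does not support non-uniform work-groups, clEnqueueNDRangeKernel fails with CL_INVALID_WORK_GROUP_SIZE unless the global size is a multiple of the local size; passing NULL instead lets the implementation choose. That is the decision the new local_work_size_ptr logic makes before each launch here and in the hunks that follow. A minimal sketch of the same decision for a 1-D launch; the helper name is illustrative, not from the package:

    #include <cstddef>

    // Decide what to pass as local_work_size for a 1-D launch of `n` items:
    // an explicit size is only safe when it divides the global size evenly,
    // unless the device supports non-uniform work-groups.
    static const size_t * pick_local_size(size_t n, size_t preferred,
                                          bool non_uniform_supported,
                                          size_t local[3]) {
        local[0] = preferred; local[1] = 1; local[2] = 1;
        if (n % preferred != 0 && !non_uniform_supported) {
            return nullptr; // let the OpenCL implementation choose
        }
        return local;
    }
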
@@ -3077,14 +3207,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  } else {
  unsigned int nth = MIN(64, ne0);
@@ -3233,14 +3368,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  }
 
@@ -3273,14 +3413,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  }
 
@@ -3320,14 +3465,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  }
 
@@ -4230,14 +4380,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons
  size_t global_work_size[] = {(size_t)n, 1, 1};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  }
 
@@ -4418,14 +4573,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr
  size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02};
  size_t local_work_size[] = {64, 1, 1};
 
+ size_t * local_work_size_ptr = local_work_size;
+ if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) {
+ local_work_size_ptr = nullptr; // Let driver choose the work-group sizes.
+ }
+
  #ifdef GGML_OPENCL_PROFILING
  cl_event evt;
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
 
  g_profiling_info.emplace_back();
- populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst);
  #else
- CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
  #endif
  }
  }