@fugood/llama.node 0.3.13 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -1,4 +1,4 @@
- #define CL_TARGET_OPENCL_VERSION 220
+ #define CL_TARGET_OPENCL_VERSION GGML_OPENCL_TARGET_VERSION
  #define CL_USE_DEPRECATED_OPENCL_1_2_APIS

  // suppress warnings in CL headers for GCC and Clang
@@ -25,6 +25,8 @@
  #include <vector>
  #include <string>
  #include <cmath>
+ #include <memory>
+ #include <charconv>

  #undef MIN
  #undef MAX
@@ -62,6 +64,97 @@ enum ADRENO_GPU_GEN {
      X1E,
  };

+ struct ggml_cl_version {
+     cl_uint major = 0;
+     cl_uint minor = 0;
+ };
+
+ // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version parse_cl_version(std::string_view str) {
+     size_t major_str_begin = 0;
+     size_t major_str_end   = str.find(".", major_str_begin);
+     if (major_str_end == std::string::npos) {
+         return {};
+     }
+
+     size_t minor_str_begin = major_str_end + 1;
+     size_t minor_str_end   = str.find(" ", minor_str_begin);
+     if (minor_str_end == std::string::npos) {
+         return {};
+     }
+
+     cl_uint version_major;
+     if (std::from_chars(str.data() + major_str_begin, str.data() + major_str_end, version_major).ec != std::errc{}) {
+         return {};
+     }
+
+     cl_uint version_minor;
+     if (std::from_chars(str.data() + minor_str_begin, str.data() + minor_str_end, version_minor).ec != std::errc{}) {
+         return {};
+     }
+     return { version_major, version_minor };
+ }
+
+ // Returns OpenCL platform's version. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_platform_version(cl_platform_id platform) {
+     size_t param_size;
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, nullptr, &param_size));
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetPlatformInfo(platform, CL_PLATFORM_VERSION, param_size, param_storage.get(), nullptr));
+
+     auto param_value = std::string_view(param_storage.get(), param_size);
+     const std::string version_prefix = "OpenCL "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+     return parse_cl_version(param_value);
+ }
+
+ // Return a version to use in OpenCL C compilation. On an error returns ggml_cl_version with all zeroes.
+ static ggml_cl_version get_opencl_c_version(ggml_cl_version platform_version, cl_device_id device) {
+     size_t param_size;
+
+ #if CL_TARGET_OPENCL_VERSION >= 300
+     if (platform_version.major >= 3) {
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, 0, nullptr, &param_size));
+         if (!param_size) {
+             return {};
+         }
+
+         std::unique_ptr<cl_name_version[]> versions(new cl_name_version[param_size]);
+         CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_ALL_VERSIONS, param_size, versions.get(), nullptr));
+         unsigned versions_count = param_size / sizeof(cl_name_version);
+
+         cl_version version_max = 0;
+         for (unsigned i = 0; i < versions_count; i++) {
+             version_max = std::max<cl_version>(versions[i].version, version_max);
+         }
+
+         return { CL_VERSION_MAJOR(version_max), CL_VERSION_MINOR(version_max) };
+     }
+ #else
+     GGML_UNUSED(platform_version);
+ #endif // CL_TARGET_OPENCL_VERSION >= 300
+
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, nullptr, &param_size));
+     if (!param_size) {
+         return {};
+     }
+
+     std::unique_ptr<char[]> param_storage(new char[param_size]);
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, param_size, param_storage.get(), nullptr));
+     auto param_value = std::string_view(param_storage.get(), param_size);
+
+     const std::string version_prefix = "OpenCL C "; // Suffix: "XX.YY <platform-specific-info>"
+     if (param_value.find(version_prefix) != 0) {
+         return {};
+     }
+     param_value.remove_prefix(version_prefix.length());
+
+     return parse_cl_version(param_value);
+ }
+
  static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) {
      if (strstr(device_name, "730") ||
          strstr(device_name, "740") ||
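
The helpers above replace the old `strstr`-based version checks with structured parsing via `std::from_chars`. A minimal, self-contained sketch of the same parsing idea (hypothetical `parse_major_minor` helper and `main`, no OpenCL headers required):

```cpp
#include <charconv>
#include <cstdio>
#include <string_view>

// Hypothetical demo of the logic in parse_cl_version(): split "XX.YY " on
// '.' and the following ' ', then convert both halves with std::from_chars.
static bool parse_major_minor(std::string_view s, unsigned & major, unsigned & minor) {
    const size_t dot = s.find('.');
    if (dot == std::string_view::npos) return false;
    const size_t space = s.find(' ', dot + 1);
    if (space == std::string_view::npos) return false;
    if (std::from_chars(s.data(), s.data() + dot, major).ec != std::errc{}) return false;
    if (std::from_chars(s.data() + dot + 1, s.data() + space, minor).ec != std::errc{}) return false;
    return true;
}

int main() {
    // Example CL_PLATFORM_VERSION string; the trailing vendor info supplies
    // the space that terminates the minor component.
    std::string_view ver = "OpenCL 3.0 CUDA 12.2.0";
    ver.remove_prefix(sizeof("OpenCL ") - 1);
    unsigned major = 0, minor = 0;
    if (parse_major_minor(ver, major, minor)) {
        std::printf("parsed OpenCL %u.%u\n", major, minor); // parsed OpenCL 3.0
    }
}
```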
@@ -278,7 +371,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      cl_int err;

- #ifdef GGML_PROFILE_OPENCL
+ #ifdef GGML_OPENCL_PROFILING
      GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n");
  #endif

@@ -444,19 +537,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
          backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
          backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);

-         // Default wave size is 128, A8x uses 64.
-         if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-             backend_ctx->adreno_wave_size = 64;
-         } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                    backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-             backend_ctx->adreno_wave_size = 128;
-         } else {
-             backend_ctx->adreno_wave_size = 128;
-             GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                 "using wave size %d, "
-                 "may not work as expected\n",
-                 backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-         }
+         // Use wave size of 64 for all Adreno GPUs.
+         backend_ctx->adreno_wave_size = 64;

      } else if (strstr(default_device->name, "Intel")) {
          backend_ctx->gpu_family = GPU_FAMILY::INTEL;
      } else {
@@ -481,16 +563,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      // A local ref of cl_device_id for convenience
      cl_device_id device = backend_ctx->device;

+     ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id);
+
      // Check device OpenCL version, OpenCL 2.0 or above is required
-     size_t device_ver_str_size;
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size);
-     char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1);
-     clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL);
-     device_ver_buffer[device_ver_str_size] = '\0';
-     GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer);
-
-     if (strstr(device_ver_buffer, "OpenCL 2") == NULL &&
-         strstr(device_ver_buffer, "OpenCL 3") == NULL) {
+     ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device);
+     if (opencl_c_version.major < 2) {
          GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n");
          return backend_ctx;
      }
@@ -527,15 +604,17 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {

      // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes
      // optional in OpenCL 3.0 (cl_khr_subgroup is mandatory in OpenCL 2.x)
-     if (strstr(device_ver_buffer, "OpenCL 3") &&
-         strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
+     if (opencl_c_version.major == 3 && strstr(ext_buffer, "cl_khr_subgroups") == NULL &&
          strstr(ext_buffer, "cl_intel_subgroups") == NULL) {
          GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) "
                         "(note that subgroups is an optional feature in OpenCL 3.0)\n");
          return backend_ctx;
      }

-     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL));
+     cl_uint base_align_in_bits;
+     CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &base_align_in_bits, NULL));
+     GGML_ASSERT(base_align_in_bits % 8u == 0);
+     backend_ctx->alignment = base_align_in_bits / 8u;
      GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment);

      clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL);
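
Context for the alignment change above: the OpenCL spec defines `CL_DEVICE_MEM_BASE_ADDR_ALIGN` in bits, while ggml uses the value as a byte alignment, hence the new divide-by-8 guarded by a `GGML_ASSERT`. A sketch of the query in isolation (assumes a valid `cl_device_id`; the `CL_CHECK` wrapper is omitted):

```cpp
#include <CL/cl.h>

// Sketch: query CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits) and
// convert it to bytes, mirroring the patched ggml_cl2_init().
static cl_uint mem_base_addr_align_bytes(cl_device_id device) {
    cl_uint align_bits = 0;
    clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align_bits), &align_bits, nullptr);
    return align_bits / 8;  // e.g. 1024 bits -> 128 bytes
}
```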
@@ -589,9 +668,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      const std::string kernel_src = read_file("ggml-opencl.cl");
  #endif

-     std::string compile_opts =
-         "-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations "
-         "-cl-finite-math-only -cl-fast-relaxed-math ";
+     auto opencl_c_std =
+         std::string("CL") + std::to_string(opencl_c_version.major) + "." + std::to_string(opencl_c_version.minor);
+
+     std::string compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable -cl-unsafe-math-optimizations"
+         " -cl-finite-math-only -cl-fast-relaxed-math";
      backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts);

      // Non matmul kernels.
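
With the detected OpenCL C version in hand, the `-cl-std=` flag now tracks the device instead of being pinned to `CL2.0`. A sketch of the string assembly, assuming detection returned major 3 and minor 0 (hypothetical `cl_ver` struct standing in for `ggml_cl_version`):

```cpp
#include <cstdio>
#include <string>

struct cl_ver { unsigned major, minor; };  // stand-in for ggml_cl_version

// Sketch: build the -cl-std option the way the patched code does.
static std::string build_cl_std(cl_ver v) {
    return "-cl-std=CL" + std::to_string(v.major) + "." + std::to_string(v.minor);
}

int main() {
    std::printf("%s\n", build_cl_std({3, 0}).c_str());  // prints: -cl-std=CL3.0
}
```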
@@ -701,10 +783,10 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err));

      // Gemv general
-     std::string CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -721,12 +803,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=2048 "
+         " -DBLOCK_STRIDE_A=16384 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -743,12 +825,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 2048, 16384
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=2048 "
-         " -DBLOCK_STRIDE_A=16384 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=2048 "
+         " -DBLOCK_STRIDE_A=16384 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -758,12 +840,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err));

      // Gemv 5504, 44032
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=5504 "
-         " -DBLOCK_STRIDE_A=44032 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=5504 "
+         " -DBLOCK_STRIDE_A=44032 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -773,12 +855,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
      CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err));

      // Gemv 16000, 128000
-     CL_gemv_compile_opts =
-         " -cl-std=CL2.0 "
-         " -cl-mad-enable "
-         " -DLINE_STRIDE_A=16000 "
-         " -DBLOCK_STRIDE_A=128000 "
-         " -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size);
+     CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std +
+         " -cl-mad-enable "
+         " -DLINE_STRIDE_A=16000 "
+         " -DBLOCK_STRIDE_A=128000 "
+         " -DSIMDGROUP_WIDTH=" +
+         std::to_string(backend_ctx->adreno_wave_size);
      if (has_vector_subgroup_broadcast) {
          CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT ";
      }
@@ -1015,17 +1097,18 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
      case GGML_OP_ADD:
      case GGML_OP_SCALE:
      case GGML_OP_MUL:
-         return true;
+         return op->src[0]->type == GGML_TYPE_F32;
      case GGML_OP_UNARY:
          switch (ggml_get_unary_op(op)) {
              case GGML_UNARY_OP_GELU:
              case GGML_UNARY_OP_SILU:
              case GGML_UNARY_OP_RELU:
-                 return ggml_is_contiguous(op->src[0]);
+                 return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
              default:
                  return false;
          }
      case GGML_OP_CLAMP:
+         return op->src[0]->type == GGML_TYPE_F32;
      case GGML_OP_SOFT_MAX:
      case GGML_OP_NORM:
      case GGML_OP_RMS_NORM:
@@ -1209,20 +1292,17 @@ struct ggml_backend_opencl_buffer_context {
      std::string name;
  };

- static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
  static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
      delete ctx;
  }

  static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-     return cl_ptr_base;
-
-     GGML_UNUSED(buffer);
+     ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer->buft->device);
+     return (void *) (uintptr_t) backend_ctx->alignment;
  }

- static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ static enum ggml_status ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
      ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;

      ggml_cl2_init(buffer->buft->device);
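
The fixed fake base pointer `0x1000` gives way to the device's byte alignment, so the base is non-NULL and every offset derived from it stays alignment-consistent; in this backend, tensor `data` values are offsets into the underlying `cl_mem`, not host addresses. A sketch of the resulting arithmetic (hypothetical values):

```cpp
#include <cstdint>
#include <cstdio>

// Sketch: with get_base() returning the alignment as a fake pointer, the
// offset handed to the OpenCL kernels is simply data - base. Values below
// are hypothetical.
int main() {
    const std::uintptr_t base   = 128;         // e.g. backend_ctx->alignment in bytes
    const std::uintptr_t data   = base + 4096; // what tensor->data would hold
    const std::size_t    offset = data - base; // offset into the cl_mem
    std::printf("offset = %zu\n", offset);     // prints: offset = 4096
}
```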
@@ -1252,7 +1332,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
          tensor->extra = view_extra;
      } else {
          {
-             size_t offset = (char *)tensor->data - (char *)cl_ptr_base;
+             size_t offset = (char *) tensor->data - (char *) ggml_backend_opencl_buffer_get_base(buffer);

              ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra();
              extra->offset = offset;
@@ -1262,6 +1342,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer,
              tensor->extra = extra;
          }
      }
+     return GGML_STATUS_SUCCESS;
  }

  // The optimized gemm and gemv kernels are used for large matrices without batch.
@@ -1376,6 +1457,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      int M = tensor->ne[1]; // ne01
      int K = tensor->ne[0]; // ne00

+     //For matrix-vector multiplication kernel, we assume K is a multiple of 32
+     GGML_ASSERT(K % 32 == 0);
+     //For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+     GGML_ASSERT(M % 4 == 0);
+
      // transpose is out of place, so we need to allocate transposed buffers
      // <----------------------------------------------------------------------------------> //
      // use sub_buffer of max buffer size instead
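
The new asserts make the layout assumptions of this Adreno path explicit: q4_0 packs 32 weights per block, so K must be a multiple of 32, and the 16-bit transpose kernels below appear to operate in groups of 4, so M must also be a multiple of 4. A sketch checking those constraints for a hypothetical weight shape:

```cpp
#include <cassert>

int main() {
    const int QK4_0 = 32;          // weights per q4_0 block in ggml
    const int K = 4096, M = 11008; // hypothetical ne00 / ne01
    assert(K % QK4_0 == 0);            // mat-vec kernel consumes whole q4_0 blocks
    assert(K % 4 == 0 && M % 4 == 0);  // transpose kernels work in groups of 4
}
```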
@@ -1416,36 +1502,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      cl_mem qT_d_image1D;
      cl_mem dT_d_image1D;

-     cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      cl_image_desc img_desc_1d;

      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = extra->q;
      q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 8 / 4;
+     img_desc_1d.image_width = M * K / 4 / 4;
      img_desc_1d.buffer = qT_d;
      qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = extra->d;
      d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);

-     img_fmt_1d = { CL_RGBA, CL_FLOAT };
+     img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
      memset(&img_desc_1d, 0, sizeof(img_desc_1d));
      img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-     img_desc_1d.image_width = M * K / 32 / 4 / 2;
+     img_desc_1d.image_width = M * K / 32 / 4;
      img_desc_1d.buffer = dT_d;
      dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
      CL_CHECK(err);
@@ -1454,8 +1540,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      // set up and call the transpose kernels
      // <----------------------------------------------------------------------------------> //
      // weights
-     int height_q = M / 8;
-     int width_q = K / 8 / 4;
+     int height_q = M / 4;
+     int width_q = K / 4 / 4;
      kernel = backend_ctx->kernel_transpose_16;

      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1555,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
      CL_CHECK(clWaitForEvents(1, &evt));

      // scales
-     int height_s = M / 8;
-     int width_s = K / 32 / 8;
+     int height_s = M / 4;
+     int width_s = K / 32 / 4;

      kernel = backend_ctx->kernel_transpose_16;
      CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
@@ -1864,7 +1950,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
      void * buf_d;
  #endif

- #ifdef GGML_USE_OPENCL
      // Make sure everything is done.
      CL_CHECK(clFinish(queue));

@@ -1900,7 +1985,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
          extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
      CL_CHECK(clFinish(queue));
  #endif // GGML_OPENCL_SOA_Q
- #endif // GGML_USE_OPENCL

      // Open file and dump.
      char fname[512];
@@ -2580,26 +2664,33 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
-     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;

-     GGML_ASSERT(ggml_is_contiguous_1(src0));
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int nth = MIN(64, ne00);

      cl_kernel kernel = backend_ctx->kernel_norm;

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL));
-
-     const int64_t nrows = ggml_nrows(src0);
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL));

-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

  #ifdef GGML_OPENCL_PROFILING
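
The norm kernel now receives the full `ne01..ne03` extents and `nb01..nb03` byte strides and is launched over a 3-D NDRange, which is what allows the `ggml_is_contiguous_1` assertion to be dropped. A host-side sketch of the row addressing this enables (hypothetical helper):

```cpp
#include <cstddef>

// Sketch: with global size {ne01*nth, ne02, ne03}, each workgroup finds its
// row through byte strides rather than assuming contiguous rows.
static const char * row_ptr(const char * base,
                            std::size_t i01, std::size_t i02, std::size_t i03,
                            std::size_t nb01, std::size_t nb02, std::size_t nb03) {
    return base + i01 * nb01 + i02 * nb02 + i03 * nb03;
}
```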
@@ -2637,16 +2728,19 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
      memcpy(&eps, dst->op_params, sizeof(float));

      const int ne00 = src0 ? src0->ne[0] : 0;
+     const int ne01 = src0 ? src0->ne[1] : 0;
+     const int ne02 = src0 ? src0->ne[2] : 0;
+     const int ne03 = src0 ? src0->ne[3] : 0;
+
      const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      GGML_ASSERT(ne00 % 4 == 0);
-     GGML_ASSERT(ggml_is_contiguous_1(src0));

      const int nth = MIN(64, ne00);

-     const int64_t nrows = ggml_nrows(src0);
-
-     size_t global_work_size[] = {(size_t)nrows*nth, 1, 1};
+     size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
      size_t local_work_size[] = {(size_t)nth, 1, 1};

      cl_kernel kernel = backend_ctx->kernel_rms_norm;
@@ -2661,15 +2755,20 @@ static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, c
          sizeof(local_work_size), local_work_size,
          sizeof(size_t), &sgs, NULL));

-     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
-     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
-     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
-     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
-     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
-     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps));
+     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
+     CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
+     CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd));
+     CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
+     CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
+     CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
+     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
+     CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
+     CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
+     CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
+     CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps));
      // This is local memory - the size depends on subgroup size.
-     CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL));
+     CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth/sgs, NULL));

  #ifdef GGML_OPENCL_PROFILING
@@ -2865,6 +2964,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      CL_CHECK(status);

      int height_B = N/4;
+     if (height_B == 0) {
+         height_B = 1;
+     }
      int width_B = K/4;
      int padded_height_B = (N + padding)/4;

@@ -3013,11 +3115,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      }

      if (N == 1) {
-         local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+         size_t wavesize = backend_ctx->adreno_wave_size;
+         local_work_size[0] = wavesize; // localsize
          local_work_size[1] = 4; // reduce factor
          local_work_size[2] = 1;

-         global_work_size[0] = M / 2;
+         global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
          global_work_size[1] = 4; // reduce factor
          global_work_size[2] = 1;
      }
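
The rewritten `global_work_size[0]` is the usual round-up-to-multiple idiom: absent non-uniform work-group support, each global work size must be a multiple of the corresponding local size, and `M / 2` alone need not be a multiple of the wave size. A minimal sketch:

```cpp
#include <cassert>
#include <cstddef>

// Sketch: round x up to the next multiple of m (m > 0), as done for
// global_work_size[0] so it stays a multiple of local_work_size[0].
static std::size_t round_up(std::size_t x, std::size_t m) {
    return ((x + m - 1) / m) * m;
}

int main() {
    assert(round_up(100, 64) == 128); // e.g. M/2 = 100, wavesize = 64
    assert(round_up(128, 64) == 128); // already a multiple: unchanged
}
```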
@@ -3026,6 +3129,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
      // enqueue kernel with profiling
      // <--------------------------------------------> //
  #ifdef GGML_OPENCL_PROFILING
+     cl_event evt;
      CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));

      g_profiling_info.emplace_back();
@@ -3767,10 +3871,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne02 = src0 ? src0->ne[2] : 0;
      const int ne03 = src0 ? src0->ne[3] : 0;

-     const int nb00 = src0 ? src0->nb[0] : 0;
-     const int nb01 = src0 ? src0->nb[1] : 0;
-     const int nb02 = src0 ? src0->nb[2] : 0;
-     const int nb03 = src0 ? src0->nb[3] : 0;
+     const cl_ulong nb00 = src0 ? src0->nb[0] : 0;
+     const cl_ulong nb01 = src0 ? src0->nb[1] : 0;
+     const cl_ulong nb02 = src0 ? src0->nb[2] : 0;
+     const cl_ulong nb03 = src0 ? src0->nb[3] : 0;

      const int ne10 = src1 ? src1->ne[0] : 0;
      const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11);
@@ -3782,10 +3886,10 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
      const int ne2 = dst ? dst->ne[2] : 0;
      const int ne3 = dst ? dst->ne[3] : 0;

-     const int nb0 = dst ? dst->nb[0] : 0;
-     const int nb1 = dst ? dst->nb[1] : 0;
-     const int nb2 = dst ? dst->nb[2] : 0;
-     const int nb3 = dst ? dst->nb[3] : 0;
+     const cl_ulong nb0 = dst ? dst->nb[0] : 0;
+     const cl_ulong nb1 = dst ? dst->nb[1] : 0;
+     const cl_ulong nb2 = dst ? dst->nb[2] : 0;
+     const cl_ulong nb3 = dst ? dst->nb[3] : 0;

      GGML_ASSERT(ne10 % ne02 == 0);
      GGML_ASSERT(ne10 >= ne02);