@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
@@ -315,6 +315,13 @@ struct ggml_backend_opencl_context {
315
315
  cl_program program_softmax_4_f16;
316
316
  cl_program program_argsort_f32_i32;
317
317
  cl_program program_sum_rows_f32;
318
+ cl_program program_repeat;
319
+ cl_program program_pad;
320
+ cl_program program_tanh;
321
+ cl_program program_upscale;
322
+ cl_program program_concat;
323
+ cl_program program_tsembd;
324
+ cl_program program_mul_mv_id_q4_0_f32_8x_flat;
318
325
 
319
326
  cl_kernel kernel_add, kernel_add_row;
320
327
  cl_kernel kernel_mul, kernel_mul_row;
@@ -351,6 +358,16 @@ struct ggml_backend_opencl_context {
351
358
  cl_kernel kernel_im2col_f32, kernel_im2col_f16;
352
359
  cl_kernel kernel_argsort_f32_i32;
353
360
  cl_kernel kernel_sum_rows_f32;
361
+ cl_kernel kernel_repeat;
362
+ cl_kernel kernel_pad;
363
+ cl_kernel kernel_tanh_f32_nd;
364
+ cl_kernel kernel_tanh_f16_nd;
365
+ cl_kernel kernel_upscale;
366
+ cl_kernel kernel_upscale_bilinear;
367
+ cl_kernel kernel_concat_f32_contiguous;
368
+ cl_kernel kernel_concat_f32_non_contiguous;
369
+ cl_kernel kernel_timestep_embedding;
370
+ cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
354
371
 
355
372
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
356
373
  // Transpose kernels
@@ -1097,6 +1114,166 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
1097
1114
  GGML_LOG_CONT(".");
1098
1115
  }
1099
1116
 
1117
+ // repeat
1118
+ {
1119
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1120
+ const std::string kernel_src {
1121
+ #include "repeat.cl.h"
1122
+ };
1123
+ #else
1124
+ const std::string kernel_src = read_file("repeat.cl");
1125
+ #endif
1126
+ if (!kernel_src.empty()) {
1127
+ backend_ctx->program_repeat =
1128
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1129
+ CL_CHECK((backend_ctx->kernel_repeat = clCreateKernel(backend_ctx->program_repeat, "kernel_repeat", &err), err));
1130
+ GGML_LOG_CONT(".");
1131
+ } else {
1132
+ GGML_LOG_WARN("ggml_opencl: repeat kernel source not found or empty. Repeat operations will not be available.\n");
1133
+ backend_ctx->program_repeat = nullptr;
1134
+ backend_ctx->kernel_repeat = nullptr;
1135
+ }
1136
+ }
1137
+
1138
+ // pad
1139
+ {
1140
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1141
+ const std::string kernel_src {
1142
+ #include "pad.cl.h"
1143
+ };
1144
+ #else
1145
+ const std::string kernel_src = read_file("pad.cl");
1146
+ #endif
1147
+ if (!kernel_src.empty()) {
1148
+ backend_ctx->program_pad =
1149
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1150
+ CL_CHECK((backend_ctx->kernel_pad = clCreateKernel(backend_ctx->program_pad, "kernel_pad", &err), err));
1151
+ GGML_LOG_CONT(".");
1152
+ } else {
1153
+ GGML_LOG_WARN("ggml_opencl: pad kernel source not found or empty. Pad operations will not be available.\n");
1154
+ backend_ctx->program_pad = nullptr;
1155
+ backend_ctx->kernel_pad = nullptr;
1156
+ }
1157
+ }
1158
+
1159
+ // tanh
1160
+ {
1161
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1162
+ const std::string kernel_src {
1163
+ #include "tanh.cl.h"
1164
+ };
1165
+ #else
1166
+ const std::string kernel_src = read_file("tanh.cl");
1167
+ #endif
1168
+ if (!kernel_src.empty()) {
1169
+ backend_ctx->program_tanh =
1170
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1171
+ CL_CHECK((backend_ctx->kernel_tanh_f32_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f32_nd", &err), err));
1172
+ CL_CHECK((backend_ctx->kernel_tanh_f16_nd = clCreateKernel(backend_ctx->program_tanh, "kernel_tanh_f16_nd", &err), err));
1173
+ GGML_LOG_CONT(".");
1174
+ } else {
1175
+ GGML_LOG_WARN("ggml_opencl: tanh kernel source not found or empty. Tanh operation will not be available.\n");
1176
+ backend_ctx->program_tanh = nullptr;
1177
+ backend_ctx->kernel_tanh_f32_nd = nullptr;
1178
+ backend_ctx->kernel_tanh_f16_nd = nullptr;
1179
+ }
1180
+ }
1181
+
1182
+ // upscale
1183
+ {
1184
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1185
+ const std::string kernel_src {
1186
+ #include "upscale.cl.h"
1187
+ };
1188
+ #else
1189
+ const std::string kernel_src = read_file("upscale.cl");
1190
+ #endif
1191
+ if (!kernel_src.empty()) {
1192
+ backend_ctx->program_upscale =
1193
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1194
+ CL_CHECK((backend_ctx->kernel_upscale = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale", &err), err));
1195
+ if (backend_ctx->program_upscale) {
1196
+ cl_int err_bilinear;
1197
+ backend_ctx->kernel_upscale_bilinear = clCreateKernel(backend_ctx->program_upscale, "kernel_upscale_bilinear", &err_bilinear);
1198
+ if (err_bilinear != CL_SUCCESS) {
1199
+ GGML_LOG_WARN("ggml_opencl: kernel_upscale_bilinear not found in upscale.cl. Bilinear upscale will not be available. Error: %d\n", err_bilinear);
1200
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1201
+ }
1202
+ } else {
1203
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1204
+ }
1205
+ GGML_LOG_CONT(".");
1206
+ } else {
1207
+ GGML_LOG_WARN("ggml_opencl: upscale kernel source not found or empty. Upscale operations will not be available.\n");
1208
+ backend_ctx->program_upscale = nullptr;
1209
+ backend_ctx->kernel_upscale = nullptr;
1210
+ backend_ctx->kernel_upscale_bilinear = nullptr;
1211
+ }
1212
+ }
1213
+
1214
+ // concat
1215
+ {
1216
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1217
+ const std::string kernel_src {
1218
+ #include "concat.cl.h"
1219
+ };
1220
+ #else
1221
+
1222
+ const std::string kernel_src = read_file("concat.cl");
1223
+ #endif
1224
+ if (!kernel_src.empty()) {
1225
+ backend_ctx->program_concat =
1226
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1227
+
1228
+ CL_CHECK((backend_ctx->kernel_concat_f32_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_contiguous", &err), err));
1229
+ CL_CHECK((backend_ctx->kernel_concat_f32_non_contiguous = clCreateKernel(backend_ctx->program_concat, "kernel_concat_f32_non_contiguous", &err), err));
1230
+ GGML_LOG_CONT(".");
1231
+ } else {
1232
+ GGML_LOG_WARN("ggml_opencl: concat kernel source not found or empty. Concat operations will not be available.\n");
1233
+ backend_ctx->program_concat = nullptr;
1234
+ backend_ctx->kernel_concat_f32_contiguous = nullptr;
1235
+ backend_ctx->kernel_concat_f32_non_contiguous = nullptr;
1236
+ }
1237
+ }
1238
+
1239
+ // timestep_embedding
1240
+ {
1241
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1242
+ const std::string kernel_src {
1243
+ #include "tsembd.cl.h"
1244
+ };
1245
+ #else
1246
+
1247
+ const std::string kernel_src = read_file("tsembd.cl");
1248
+ #endif
1249
+ if (!kernel_src.empty()) {
1250
+ backend_ctx->program_tsembd =
1251
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1252
+ CL_CHECK((backend_ctx->kernel_timestep_embedding = clCreateKernel(backend_ctx->program_tsembd, "kernel_timestep_embedding", &err), err));
1253
+ GGML_LOG_CONT(".");
1254
+ } else {
1255
+ GGML_LOG_WARN("ggml_opencl: timestep_embedding kernel source not found or empty. This op will not be available.\n");
1256
+ backend_ctx->program_tsembd = nullptr;
1257
+ backend_ctx->kernel_timestep_embedding = nullptr;
1258
+ }
1259
+ }
1260
+
1261
+ // mul_mv_id_q4_0_f32_8x_flat
1262
+ {
1263
+ #ifdef GGML_OPENCL_EMBED_KERNELS
1264
+ const std::string kernel_src {
1265
+ #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
1266
+ };
1267
+ #else
1268
+ const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
1269
+ #endif
1270
+ backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
1271
+ build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
1272
+
1273
+ CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
1274
+ GGML_LOG_CONT(".");
1275
+ }
1276
+
1100
1277
  // Adreno kernels
1101
1278
  #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
1102
1279
  // transpose
@@ -1863,7 +2040,12 @@ static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const g
1863
2040
  }
1864
2041
 
1865
2042
  static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
1866
- GGML_UNUSED(backend);
2043
+ auto * backend_ctx = static_cast<ggml_backend_opencl_context *>(backend->context);
2044
+
2045
+ cl_event evt;
2046
+ CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, 0, nullptr, &evt));
2047
+ CL_CHECK(clWaitForEvents(1, &evt));
2048
+ CL_CHECK(clReleaseEvent(evt));
1867
2049
  }
1868
2050
 
1869
2051
  // Synchronizes the 'backend_ctx's device with others so that commands
@@ -1976,9 +2158,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1976
2158
  case GGML_UNARY_OP_SILU:
1977
2159
  case GGML_UNARY_OP_RELU:
1978
2160
  case GGML_UNARY_OP_GELU_QUICK:
1979
- return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
2161
+ return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
1980
2162
  case GGML_UNARY_OP_SIGMOID:
1981
2163
  return ggml_is_contiguous(op->src[0]);
2164
+ case GGML_UNARY_OP_TANH:
2165
+ return (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
2166
+ (op->src[0]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16);
1982
2167
  default:
1983
2168
  return false;
1984
2169
  }
@@ -1988,6 +2173,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
1988
2173
  case GGML_OP_NORM:
1989
2174
  case GGML_OP_RMS_NORM:
1990
2175
  return true;
2176
+ case GGML_OP_REPEAT:
2177
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
2178
+ case GGML_OP_PAD:
2179
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
2180
+ op->src[0]->ne[3] == 1 && op->ne[3] == 1;
2181
+ case GGML_OP_UPSCALE:
2182
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2183
+ case GGML_OP_CONCAT:
2184
+ return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
2185
+ case GGML_OP_TIMESTEP_EMBEDDING:
2186
+ return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
1991
2187
  case GGML_OP_GROUP_NORM:
1992
2188
  return ggml_is_contiguous(op->src[0]);
1993
2189
  case GGML_OP_MUL_MAT:
@@ -2000,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
2000
2196
  return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2001
2197
  }
2002
2198
  return false;
2199
+ case GGML_OP_MUL_MAT_ID:
2200
+ if (op->src[0]->type == GGML_TYPE_Q4_0) {
2201
+ if (op->src[1]->type == GGML_TYPE_F32) {
2202
+ return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
2203
+ }
2204
+ }
2205
+ return false;
2003
2206
  case GGML_OP_RESHAPE:
2004
2207
  case GGML_OP_VIEW:
2005
2208
  case GGML_OP_PERMUTE:
@@ -2052,7 +2255,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
2052
2255
  /* .set_tensor_async = */ NULL, /* ggml_backend_opencl_set_tensor_async */
2053
2256
  /* .get_tensor_async = */ NULL, /* ggml_backend_opencl_get_tensor_async */
2054
2257
  /* .cpy_tensor_async = */ NULL, /* ggml_backend_opencl_cpy_tensor_async */
2055
- /* .synchronize = */ NULL, /* ggml_backend_opencl_synchronize */
2258
+ /* .synchronize = */ ggml_backend_opencl_synchronize,
2056
2259
  /* .graph_plan_create = */ NULL,
2057
2260
  /* .graph_plan_free = */ NULL,
2058
2261
  /* .graph_plan_update = */ NULL,
@@ -4108,6 +4311,536 @@ static void ggml_cl_group_norm(ggml_backend_t backend, const ggml_tensor * src0,
4108
4311
  #endif
4109
4312
  }
4110
4313
 
4314
+ static void ggml_cl_tanh(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4315
+ GGML_ASSERT(src0);
4316
+ GGML_ASSERT(src0->extra);
4317
+ GGML_ASSERT(dst);
4318
+ GGML_ASSERT(dst->extra);
4319
+
4320
+ UNUSED(src1);
4321
+
4322
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4323
+ cl_command_queue queue = backend_ctx->queue;
4324
+
4325
+ ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
4326
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
4327
+
4328
+ cl_ulong offset0_abs = extra0->offset + src0->view_offs;
4329
+ cl_ulong offsetd_abs = extrad->offset + dst->view_offs;
4330
+
4331
+ cl_kernel kernel;
4332
+ if (dst->type == GGML_TYPE_F32) {
4333
+ kernel = backend_ctx->kernel_tanh_f32_nd;
4334
+ } else if (dst->type == GGML_TYPE_F16) {
4335
+ kernel = backend_ctx->kernel_tanh_f16_nd;
4336
+ } else {
4337
+ GGML_ASSERT(false && "Unsupported type for ggml_cl_tanh");
4338
+ }
4339
+ GGML_ASSERT(kernel != nullptr);
4340
+
4341
+ const int ne00 = src0->ne[0]; const int ne01 = src0->ne[1]; const int ne02 = src0->ne[2]; const int ne03 = src0->ne[3];
4342
+ const cl_ulong nb00 = src0->nb[0]; const cl_ulong nb01 = src0->nb[1]; const cl_ulong nb02 = src0->nb[2]; const cl_ulong nb03 = src0->nb[3];
4343
+
4344
+ const int ne10 = dst->ne[0]; const int ne11 = dst->ne[1]; const int ne12 = dst->ne[2]; const int ne13 = dst->ne[3];
4345
+ const cl_ulong nb10 = dst->nb[0]; const cl_ulong nb11 = dst->nb[1]; const cl_ulong nb12 = dst->nb[2]; const cl_ulong nb13 = dst->nb[3];
4346
+
4347
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
4348
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_abs));
4349
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device));
4350
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd_abs));
4351
+
4352
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00));
4353
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01));
4354
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02));
4355
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03));
4356
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00));
4357
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
4358
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong),&nb02));
4359
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong),&nb03));
4360
+
4361
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10));
4362
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11));
4363
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12));
4364
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13));
4365
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong),&nb10));
4366
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong),&nb11));
4367
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong),&nb12));
4368
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong),&nb13));
4369
+
4370
+ size_t global_work_size[3];
4371
+ if (ne10 == 0 || ne11 == 0 || ne12 == 0 || ne13 == 0) { // Handle case of 0 elements
4372
+ return;
4373
+ }
4374
+ global_work_size[0] = (size_t)ne10;
4375
+ global_work_size[1] = (size_t)ne11;
4376
+ global_work_size[2] = (size_t)ne12;
4377
+
4378
+ size_t lws0 = 16, lws1 = 4, lws2 = 1;
4379
+ if (ne10 < 16) lws0 = ne10;
4380
+ if (ne11 < 4) lws1 = ne11;
4381
+ if (ne12 < 1) lws2 = ne12 > 0 ? ne12 : 1;
4382
+
4383
+ while (lws0 * lws1 * lws2 > 256 && lws0 > 1) lws0 /= 2;
4384
+ while (lws0 * lws1 * lws2 > 256 && lws1 > 1) lws1 /= 2;
4385
+ while (lws0 * lws1 * lws2 > 256 && lws2 > 1) lws2 /= 2;
4386
+
4387
+
4388
+ size_t local_work_size[] = {lws0, lws1, lws2};
4389
+
4390
+ size_t* local_work_size_ptr = local_work_size;
4391
+ if (!backend_ctx->non_uniform_workgroups) {
4392
+ if (global_work_size[0] % local_work_size[0] != 0 ||
4393
+ global_work_size[1] % local_work_size[1] != 0 ||
4394
+ global_work_size[2] % local_work_size[2] != 0) {
4395
+ local_work_size_ptr = NULL;
4396
+ }
4397
+ }
4398
+ if (global_work_size[0] == 0 || global_work_size[1] == 0 || global_work_size[2] == 0) return;
4399
+
4400
+
4401
+ #ifdef GGML_OPENCL_PROFILING
4402
+ cl_event evt;
4403
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4404
+
4405
+ g_profiling_info.emplace_back();
4406
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4407
+ #else
4408
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4409
+ #endif
4410
+ }
4411
+
4412
+ static void ggml_cl_repeat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1_shape_def, ggml_tensor * dst) {
4413
+ GGML_ASSERT(src0);
4414
+ GGML_ASSERT(src0->extra);
4415
+ GGML_ASSERT(dst);
4416
+ GGML_ASSERT(dst->extra);
4417
+ GGML_ASSERT(dst->type == src0->type);
4418
+
4419
+ UNUSED(src1_shape_def);
4420
+
4421
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4422
+ cl_command_queue queue = backend_ctx->queue;
4423
+
4424
+ if (backend_ctx->kernel_repeat == nullptr) {
4425
+ GGML_LOG_WARN("%s: repeat kernel not available, skipping OpenCL execution.\n", __func__);
4426
+ return;
4427
+ }
4428
+
4429
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4430
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4431
+
4432
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4433
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4434
+
4435
+ const int src0_ne0 = src0->ne[0]; const int src0_ne1 = src0->ne[1]; const int src0_ne2 = src0->ne[2]; const int src0_ne3 = src0->ne[3];
4436
+ const cl_ulong src0_nb0 = src0->nb[0]; const cl_ulong src0_nb1 = src0->nb[1]; const cl_ulong src0_nb2 = src0->nb[2]; const cl_ulong src0_nb3 = src0->nb[3];
4437
+
4438
+ const int dst_ne0 = dst->ne[0]; const int dst_ne1 = dst->ne[1]; const int dst_ne2 = dst->ne[2]; const int dst_ne3 = dst->ne[3];
4439
+ const cl_ulong dst_nb0 = dst->nb[0]; const cl_ulong dst_nb1 = dst->nb[1]; const cl_ulong dst_nb2 = dst->nb[2]; const cl_ulong dst_nb3 = dst->nb[3];
4440
+
4441
+ cl_kernel kernel = backend_ctx->kernel_repeat;
4442
+
4443
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4444
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra_dst->data_device));
4445
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_ulong), &off_src0));
4446
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4447
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &src0_ne0));
4448
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &src0_ne1));
4449
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &src0_ne2));
4450
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &src0_ne3));
4451
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &src0_nb0));
4452
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &src0_nb1));
4453
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &src0_nb2));
4454
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &src0_nb3));
4455
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &dst_ne0));
4456
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &dst_ne1));
4457
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &dst_ne2));
4458
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dst_ne3));
4459
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &dst_nb0));
4460
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &dst_nb1));
4461
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &dst_nb2));
4462
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &dst_nb3));
4463
+
4464
+ size_t gws0 = dst_ne1 > 0 ? (size_t)dst_ne1 : 1;
4465
+ size_t gws1 = dst_ne2 > 0 ? (size_t)dst_ne2 : 1;
4466
+ size_t gws2 = dst_ne3 > 0 ? (size_t)dst_ne3 : 1;
4467
+
4468
+ size_t global_work_size[] = { gws0, gws1, gws2 };
4469
+
4470
+ #ifdef GGML_OPENCL_PROFILING
4471
+ cl_event evt;
4472
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, &evt));
4473
+
4474
+ g_profiling_info.emplace_back();
4475
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, (size_t[3]){0,0,0}, dst);
4476
+ #else
4477
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4478
+ #endif
4479
+ }
4480
+
4481
+ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4482
+ GGML_ASSERT(src0);
4483
+ GGML_ASSERT(src0->extra);
4484
+ GGML_ASSERT(dst);
4485
+ GGML_ASSERT(dst->extra);
4486
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4487
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4488
+ GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
4489
+
4490
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4491
+ cl_command_queue queue = backend_ctx->queue;
4492
+
4493
+ if (backend_ctx->kernel_pad == nullptr) {
4494
+ GGML_LOG_WARN("%s: pad kernel not available, skipping OpenCL execution.\n", __func__);
4495
+ return;
4496
+ }
4497
+
4498
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4499
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4500
+
4501
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4502
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4503
+
4504
+ const int s_ne0 = src0->ne[0];
4505
+ const int s_ne1 = src0->ne[1];
4506
+ const int s_ne2 = src0->ne[2];
4507
+
4508
+ const int d_ne0 = dst->ne[0];
4509
+ const int d_ne1 = dst->ne[1];
4510
+ const int d_ne2 = dst->ne[2];
4511
+
4512
+ cl_kernel kernel = backend_ctx->kernel_pad;
4513
+
4514
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4515
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4516
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4517
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4518
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
4519
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
4520
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
4521
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne0));
4522
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne1));
4523
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne2));
4524
+
4525
+ size_t lws0 = 64;
4526
+ size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
4527
+
4528
+ size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
4529
+ size_t local_work_size[] = { lws0, 1, 1 };
4530
+
4531
+ size_t * local_work_size_ptr = local_work_size;
4532
+ if (d_ne0 % lws0 != 0 && !backend_ctx->non_uniform_workgroups) {
4533
+ local_work_size_ptr = nullptr;
4534
+ }
4535
+
4536
+ #ifdef GGML_OPENCL_PROFILING
4537
+ cl_event evt;
4538
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4539
+
4540
+ g_profiling_info.emplace_back();
4541
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr ? local_work_size : (size_t[3]){0,0,0}, dst);
4542
+ #else
4543
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4544
+ #endif
4545
+ }
4546
+
4547
+ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4548
+ GGML_ASSERT(src0);
4549
+ GGML_ASSERT(src0->extra);
4550
+ GGML_ASSERT(dst);
4551
+ GGML_ASSERT(dst->extra);
4552
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4553
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4554
+
4555
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4556
+ cl_command_queue queue = backend_ctx->queue;
4557
+
4558
+ const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
4559
+ cl_kernel kernel = nullptr;
4560
+
4561
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4562
+ kernel = backend_ctx->kernel_upscale;
4563
+ if (kernel == nullptr) {
4564
+ GGML_LOG_WARN("%s: nearest upscale kernel not available, skipping OpenCL execution.\n", __func__);
4565
+ return;
4566
+ }
4567
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4568
+ kernel = backend_ctx->kernel_upscale_bilinear;
4569
+ if (kernel == nullptr) {
4570
+ GGML_LOG_WARN("%s: bilinear upscale kernel not available, skipping OpenCL execution.\n", __func__);
4571
+ return;
4572
+ }
4573
+ } else {
4574
+ GGML_LOG_WARN("%s: unsupported upscale mode %d, skipping OpenCL execution.\n", __func__, mode);
4575
+ return;
4576
+ }
4577
+
4578
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4579
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4580
+
4581
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4582
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4583
+
4584
+ const cl_ulong nb00 = src0->nb[0];
4585
+ const cl_ulong nb01 = src0->nb[1];
4586
+ const cl_ulong nb02 = src0->nb[2];
4587
+ const cl_ulong nb03 = src0->nb[3];
4588
+
4589
+ const int ne00_src = src0->ne[0];
4590
+ const int ne01_src = src0->ne[1];
4591
+
4592
+ const int ne10_dst = dst->ne[0];
4593
+ const int ne11_dst = dst->ne[1];
4594
+ const int ne12_dst = dst->ne[2];
4595
+ const int ne13_dst = dst->ne[3];
4596
+
4597
+ const float sf0 = (float)dst->ne[0] / src0->ne[0];
4598
+ const float sf1 = (float)dst->ne[1] / src0->ne[1];
4599
+ const float sf2 = (float)dst->ne[2] / src0->ne[2];
4600
+ const float sf3 = (float)dst->ne[3] / src0->ne[3];
4601
+
4602
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4603
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4604
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4605
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4606
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &nb00));
4607
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01));
4608
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &nb02));
4609
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb03));
4610
+
4611
+ if (mode == GGML_SCALE_MODE_NEAREST) {
4612
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne10_dst));
4613
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11_dst));
4614
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12_dst));
4615
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13_dst));
4616
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &sf0));
4617
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float), &sf1));
4618
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf2));
4619
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
4620
+ } else if (mode == GGML_SCALE_MODE_BILINEAR) {
4621
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00_src));
4622
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01_src));
4623
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10_dst));
4624
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne11_dst));
4625
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12_dst));
4626
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13_dst));
4627
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(float), &sf0));
4628
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf1));
4629
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(float), &sf2));
4630
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(float), &sf3));
4631
+ }
4632
+
4633
+
4634
+ size_t dst_total_elements = (size_t)ne10_dst * ne11_dst * ne12_dst * ne13_dst;
4635
+ if (dst_total_elements == 0) {
4636
+ return;
4637
+ }
4638
+ size_t global_work_size[] = { dst_total_elements, 1, 1 };
4639
+ size_t local_work_size_pref = 256;
4640
+ size_t local_work_size[] = { MIN(local_work_size_pref, dst_total_elements), 1, 1};
4641
+
4642
+ size_t * local_work_size_ptr = local_work_size;
4643
+ if (dst_total_elements % local_work_size[0] != 0 && !backend_ctx->non_uniform_workgroups) {
4644
+ local_work_size_ptr = nullptr;
4645
+ }
4646
+
4647
+ #ifdef GGML_OPENCL_PROFILING
4648
+ cl_event evt;
4649
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt));
4650
+
4651
+ g_profiling_info.emplace_back();
4652
+ size_t profiling_gws[3] = {global_work_size[0], 1, 1};
4653
+ size_t profiling_lws[3] = {local_work_size_ptr ? local_work_size[0] : 0, 1, 1};
4654
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4655
+ #else
4656
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL));
4657
+ #endif
4658
+ }
4659
+
4660
+ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4661
+ GGML_ASSERT(src0);
4662
+ GGML_ASSERT(src0->extra);
4663
+ GGML_ASSERT(src1);
4664
+ GGML_ASSERT(src1->extra);
4665
+ GGML_ASSERT(dst);
4666
+ GGML_ASSERT(dst->extra);
4667
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4668
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
4669
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4670
+
4671
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4672
+ cl_command_queue queue = backend_ctx->queue;
4673
+
4674
+ if (backend_ctx->kernel_concat_f32_contiguous == nullptr || backend_ctx->kernel_concat_f32_non_contiguous == nullptr) {
4675
+ GGML_LOG_WARN("%s: concat kernels not available, skipping OpenCL execution.\n", __func__);
4676
+ return;
4677
+ }
4678
+
4679
+ ggml_tensor_extra_cl * extra0_cl = (ggml_tensor_extra_cl *)src0->extra;
4680
+ ggml_tensor_extra_cl * extra1_cl = (ggml_tensor_extra_cl *)src1->extra;
4681
+ ggml_tensor_extra_cl * extrad_cl = (ggml_tensor_extra_cl *)dst->extra;
4682
+
4683
+ cl_ulong off_src0 = extra0_cl->offset + src0->view_offs;
4684
+ cl_ulong off_src1 = extra1_cl->offset + src1->view_offs;
4685
+ cl_ulong off_dst = extrad_cl->offset + dst->view_offs;
4686
+
4687
+ const int32_t dim = ((const int32_t *) dst->op_params)[0];
4688
+ GGML_ASSERT(dim >= 0 && dim <= 3);
4689
+
4690
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
4691
+ if (dim == 3) {
4692
+
4693
+ size_t nbytes_src0 = ggml_nbytes(src0);
4694
+ size_t nbytes_src1 = ggml_nbytes(src1);
4695
+
4696
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra0_cl->data_device, extrad_cl->data_device,
4697
+ off_src0, off_dst, nbytes_src0, 0, NULL, NULL));
4698
+ CL_CHECK(clEnqueueCopyBuffer(queue, extra1_cl->data_device, extrad_cl->data_device,
4699
+ off_src1, off_dst + nbytes_src0, nbytes_src1, 0, NULL, NULL));
4700
+ } else {
4701
+
4702
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_contiguous;
4703
+ size_t global_work_size[3];
4704
+
4705
+ for (int i3 = 0; i3 < dst->ne[3]; ++i3) {
4706
+ cl_ulong current_off_src0 = off_src0 + (i3 * src0->nb[3]);
4707
+ cl_ulong current_off_src1 = off_src1 + (i3 * src1->nb[3]);
4708
+ cl_ulong current_off_dst = off_dst + (i3 * dst->nb[3]);
4709
+
4710
+ int d_ne00 = src0->ne[0]; int d_ne01 = src0->ne[1]; int d_ne02 = src0->ne[2];
4711
+ int d_ne10 = src1->ne[0]; int d_ne11 = src1->ne[1]; int d_ne12 = src1->ne[2];
4712
+ int d_ne0 = dst->ne[0]; int d_ne1 = dst->ne[1]; int d_ne2 = dst->ne[2];
4713
+
4714
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4715
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &current_off_src0));
4716
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4717
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &current_off_src1));
4718
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4719
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &current_off_dst));
4720
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &d_ne00));
4721
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &d_ne01));
4722
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &d_ne02));
4723
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &d_ne10));
4724
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &d_ne11));
4725
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &d_ne12));
4726
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
4727
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
4728
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
4729
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &dim));
4730
+
4731
+ global_work_size[0] = d_ne0;
4732
+ global_work_size[1] = d_ne1;
4733
+ global_work_size[2] = d_ne2;
4734
+
4735
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, NULL, 0, NULL, NULL));
4736
+ }
4737
+ }
4738
+ } else {
4739
+ cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
4740
+
4741
+ long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
4742
+ cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
4743
+
4744
+ cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
4745
+
4746
+ long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
4747
+ cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
4748
+
4749
+
4750
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_cl->data_device));
4751
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4752
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1_cl->data_device));
4753
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_src1));
4754
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
4755
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
4756
+
4757
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(long), &ne00));
4758
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(long), &ne01));
4759
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(long), &ne02));
4760
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(long), &ne03));
4761
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
4762
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
4763
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
4764
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03));
4765
+
4766
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb10));
4767
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb11));
4768
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
4769
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
4770
+
4771
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(long), &d_ne0));
4772
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(long), &d_ne1));
4773
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(long), &d_ne2));
4774
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(long), &d_ne3));
4775
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
4776
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
4777
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
4778
+ CL_CHECK(clSetKernelArg(kernel, 25, sizeof(cl_ulong), &d_nb3));
4779
+ CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &dim));
4780
+
4781
+ size_t global_work_size_nc[] = { d_ne1 > 0 ? (size_t)d_ne1 : 1,
4782
+ d_ne2 > 0 ? (size_t)d_ne2 : 1,
4783
+ d_ne3 > 0 ? (size_t)d_ne3 : 1 };
4784
+
4785
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size_nc, NULL, 0, NULL, NULL));
4786
+ }
4787
+ }
4788
+
4789
+ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor * src0, ggml_tensor * dst) {
4790
+ GGML_ASSERT(src0);
4791
+ GGML_ASSERT(src0->extra);
4792
+ GGML_ASSERT(dst);
4793
+ GGML_ASSERT(dst->extra);
4794
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
4795
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
4796
+
4797
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
4798
+ cl_command_queue queue = backend_ctx->queue;
4799
+
4800
+ if (backend_ctx->kernel_timestep_embedding == nullptr) {
4801
+ GGML_LOG_WARN("%s: timestep_embedding kernel not available, skipping OpenCL execution.\n", __func__);
4802
+ return;
4803
+ }
4804
+
4805
+ ggml_tensor_extra_cl * extra_src0 = (ggml_tensor_extra_cl *)src0->extra;
4806
+ ggml_tensor_extra_cl * extra_dst = (ggml_tensor_extra_cl *)dst->extra;
4807
+
4808
+ cl_ulong off_src0 = extra_src0->offset + src0->view_offs;
4809
+ cl_ulong off_dst = extra_dst->offset + dst->view_offs;
4810
+
4811
+ const int logical_dim = dst->op_params[0];
4812
+ const int max_period = dst->op_params[1];
4813
+ const int dst_nb1_bytes = dst->nb[1];
4814
+
4815
+ cl_kernel kernel = backend_ctx->kernel_timestep_embedding;
4816
+
4817
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
4818
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
4819
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
4820
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
4821
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &dst_nb1_bytes));
4822
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &logical_dim));
4823
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &max_period));
4824
+
4825
+ size_t gws0 = (size_t)(((logical_dim + 1) / 2) + 1);
4826
+
4827
+ size_t gws1 = (size_t)src0->ne[0];
4828
+
4829
+ size_t global_work_size[] = {gws0, gws1, 1};
4830
+
4831
+ #ifdef GGML_OPENCL_PROFILING
4832
+ cl_event evt;
4833
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, &evt)); // Pass 2 for 2D problem
4834
+
4835
+ g_profiling_info.emplace_back();
4836
+ size_t profiling_gws[3] = {global_work_size[0], global_work_size[1], 1};
4837
+ size_t profiling_lws[3] = {0,0,0}; // Reflects NULL LWS
4838
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, profiling_gws, profiling_lws, dst);
4839
+ #else
4840
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL)); // Pass 2 for 2D problem
4841
+ #endif
4842
+ }
4843
+
4111
4844
  static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4112
4845
  GGML_ASSERT(src0);
4113
4846
  GGML_ASSERT(src0->extra);
@@ -4828,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
4828
5561
  }
4829
5562
  }
4830
5563
 
5564
+ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
5565
+ GGML_ASSERT(src0);
5566
+ GGML_ASSERT(src0->extra);
5567
+ GGML_ASSERT(src1);
5568
+ GGML_ASSERT(src1->extra);
5569
+ GGML_ASSERT(dst);
5570
+ GGML_ASSERT(dst->extra);
5571
+
5572
+ const ggml_tensor * src2 = dst->src[2];
5573
+ GGML_ASSERT(src2);
5574
+ GGML_ASSERT(src2->extra);
5575
+
5576
+ ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
5577
+ cl_command_queue queue = backend_ctx->queue;
5578
+
5579
+ ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
5580
+ ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
5581
+ ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
5582
+
5583
+ cl_ulong offset1 = extra1->offset + src1->view_offs;
5584
+ cl_ulong offset2 = extra2->offset + src2->view_offs;
5585
+ cl_ulong offsetd = extrad->offset + dst->view_offs;
5586
+
5587
+ #ifdef GGML_OPENCL_SOA_Q
5588
+ ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
5589
+ #endif
5590
+
5591
+ const int ne00 = src0->ne[0];
5592
+ const int ne01 = src0->ne[1];
5593
+ const int ne02 = src0->ne[2];
5594
+ const int ne03 = src0->ne[3];
5595
+
5596
+ const cl_ulong nb00 = src0->nb[0];
5597
+ const cl_ulong nb02 = src0->nb[2];
5598
+
5599
+ const int ne10 = src1->ne[0];
5600
+ const int ne11 = src1->ne[1];
5601
+ const int ne12 = src1->ne[2];
5602
+ const int ne13 = src1->ne[3];
5603
+
5604
+ const cl_ulong nb11 = src1->nb[1];
5605
+ const cl_ulong nb12 = src1->nb[2];
5606
+
5607
+ const int ne20 = src2->ne[0];
5608
+ const int ne21 = src2->ne[1];
5609
+
5610
+ const cl_ulong nb21 = src2->nb[1];
5611
+
5612
+ const int ne0 = dst->ne[0];
5613
+ const int ne1 = dst->ne[1];
5614
+
5615
+ const int r2 = ne12/ne02;
5616
+ const int r3 = ne13/ne03;
5617
+ const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
5618
+
5619
+ GGML_ASSERT(ne00 == ne10);
5620
+
5621
+ int sgs = 32; // subgroup size
5622
+ int nsg = 1; // number of subgroups
5623
+ int nrows = 1; // number of row in src1
5624
+ int ndst = 4; // number of values produced by each subgroup
5625
+
5626
+ cl_kernel kernel;
5627
+
5628
+ // subgroup mat vec
5629
+ switch (src0->type) {
5630
+ case GGML_TYPE_Q4_0: {
5631
+ kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
5632
+
5633
+ if (backend_ctx->gpu_family == INTEL) {
5634
+ sgs = 16;
5635
+ nsg = 1;
5636
+ ndst = 8;
5637
+ } else if (backend_ctx->gpu_family == ADRENO) {
5638
+ sgs = 64;
5639
+ nsg = 1;
5640
+ ndst = 8;
5641
+ } else {
5642
+ GGML_ASSERT(false && "TODO: Unknown GPU");
5643
+ }
5644
+
5645
+ CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q));
5646
+ CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d));
5647
+ CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
5648
+ CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
5649
+ CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
5650
+ CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
5651
+ CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
5652
+ CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
5653
+ CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
5654
+ CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
5655
+ CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
5656
+ CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
5657
+ CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
5658
+ CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
5659
+ CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
5660
+ CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
5661
+ CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
5662
+ CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
5663
+ CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
5664
+ CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
5665
+ CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
5666
+ CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
5667
+ CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
5668
+ CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
5669
+ CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
5670
+
5671
+ break;
5672
+ }
5673
+ default:
5674
+ GGML_ASSERT(false && "not implemented");;
5675
+ }
5676
+
5677
+ int _ne1 = 1;
5678
+ int ne123 = dst_rows;
5679
+
5680
+ size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
5681
+ size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
5682
+
5683
+ #ifdef GGML_OPENCL_PROFILING
5684
+ cl_event evt;
5685
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
5686
+
5687
+ g_profiling_info.emplace_back();
5688
+ populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
5689
+ #else
5690
+ CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
5691
+ #endif
5692
+ }
5693
+
4831
5694
  static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4832
5695
  GGML_ASSERT(src0);
4833
5696
  GGML_ASSERT(src0->extra);
@@ -5667,6 +6530,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5667
6530
  }
5668
6531
  func = ggml_cl_sigmoid;
5669
6532
  break;
6533
+ case GGML_UNARY_OP_TANH:
6534
+ if (!any_on_device) {
6535
+ return false;
6536
+ }
6537
+ func = ggml_cl_tanh;
6538
+ break;
5670
6539
  default:
5671
6540
  return false;
5672
6541
  } break;
@@ -5694,12 +6563,48 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
5694
6563
  }
5695
6564
  func = ggml_cl_group_norm;
5696
6565
  break;
6566
+ case GGML_OP_REPEAT:
6567
+ if (!any_on_device) {
6568
+ return false;
6569
+ }
6570
+ func = ggml_cl_repeat;
6571
+ break;
6572
+ case GGML_OP_PAD:
6573
+ if (!any_on_device) {
6574
+ return false;
6575
+ }
6576
+ ggml_cl_pad(backend, tensor->src[0], tensor);
6577
+ return true;
6578
+ case GGML_OP_UPSCALE:
6579
+ if (!any_on_device) {
6580
+ return false;
6581
+ }
6582
+ ggml_cl_upscale(backend, tensor->src[0], tensor);
6583
+ return true;
6584
+ case GGML_OP_CONCAT:
6585
+ if (!any_on_device) {
6586
+ return false;
6587
+ }
6588
+ func = ggml_cl_concat;
6589
+ break;
6590
+ case GGML_OP_TIMESTEP_EMBEDDING:
6591
+ if (!any_on_device) {
6592
+ return false;
6593
+ }
6594
+ ggml_cl_timestep_embedding(backend, tensor->src[0], tensor);
6595
+ return true;
5697
6596
  case GGML_OP_MUL_MAT:
5698
6597
  if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
5699
6598
  return false;
5700
6599
  }
5701
6600
  func = ggml_cl_mul_mat;
5702
6601
  break;
6602
+ case GGML_OP_MUL_MAT_ID:
6603
+ if (!any_on_device) {
6604
+ return false;
6605
+ }
6606
+ func = ggml_cl_mul_mat_id;
6607
+ break;
5703
6608
  case GGML_OP_SCALE:
5704
6609
  if (!any_on_device) {
5705
6610
  return false;