@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
@@ -347,14 +347,15 @@ static enum ggml_status
347
347
  ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
348
348
  ggml_tensor *tensor) try {
349
349
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
350
- debug_print_tensor(": tensor=", tensor, "\n");
350
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
351
351
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
352
352
 
353
353
  if (tensor->view_src != NULL) {
354
354
  assert(tensor->view_src->buffer->buft == buffer->buft);
355
355
  return GGML_STATUS_SUCCESS;
356
356
  }
357
- if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K) && !g_ggml_sycl_disable_optimize) {
357
+ if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) &&
358
+ !g_ggml_sycl_disable_optimize) {
358
359
  ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
359
360
  tensor->extra = extra;
360
361
  ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx.
@@ -384,7 +385,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
384
385
  const void *data, size_t offset,
385
386
  size_t size) try {
386
387
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
387
- debug_print_tensor(": tensor=", tensor);
388
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
388
389
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
389
390
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
390
391
  ggml_sycl_set_device(ctx->device);
@@ -412,7 +413,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
412
413
  void *data, size_t offset,
413
414
  size_t size) try {
414
415
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
415
- debug_print_tensor(": tensor=", tensor);
416
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
416
417
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
417
418
  ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
418
419
 
@@ -443,8 +444,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
443
444
  ggml_tensor *dst) try {
444
445
  bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer);
445
446
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
446
- debug_print_tensor(": dst=", dst);
447
- debug_print_tensor(" src=", src);
447
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
448
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
448
449
  GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
449
450
  if (is_cpy_supported) {
450
451
  ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context;
@@ -524,7 +525,7 @@ catch (sycl::exception const &exc) {
524
525
  static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value,
525
526
  size_t offset, size_t size) {
526
527
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
527
- debug_print_tensor(": tensor=", tensor);
528
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
528
529
  GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value);
529
530
  ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context;
530
531
  SYCL_CHECK(ggml_sycl_set_device(ctx->device));
@@ -804,7 +805,7 @@ static enum ggml_status
804
805
  ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
805
806
  ggml_tensor *tensor) try {
806
807
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
807
- debug_print_tensor(": tensor=", tensor, "\n");
808
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str());
808
809
  GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
809
810
 
810
811
  ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
@@ -890,7 +891,7 @@ ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
890
891
  ggml_tensor *tensor, const void *data,
891
892
  size_t offset, size_t size) try {
892
893
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
893
- debug_print_tensor(": tensor=", tensor);
894
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
894
895
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
895
896
  // split tensors must always be set in their entirety at once
896
897
  GGML_ASSERT(offset == 0);
@@ -946,7 +947,7 @@ ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
946
947
  const ggml_tensor *tensor, void *data,
947
948
  size_t offset, size_t size) try {
948
949
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
949
- debug_print_tensor(": tensor=", tensor);
950
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
950
951
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
951
952
  // split tensors must always be set in their entirety at once
952
953
  GGML_ASSERT(offset == 0);
@@ -1434,6 +1435,59 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
1434
1435
  reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
1435
1436
  }
1436
1437
 
1438
+ template <int ElementsPerWI>
1439
+ static __dpct_inline__ void quantize_and_reorder_q8_1(const float * __restrict__ x, void * reordered_q8_tensor,
1440
+ const int kx, const int kx_padded, const sycl::nd_item<1> & it) {
1441
+ /*
1442
+ Quantizes and reorders the resultant q8 tensor in a per row fashion
1443
+ Each sub-group calculates one quant block. i.e. QK8_1 quant values and the d and sum values
1444
+ */
1445
+
1446
+ auto subgroup_id = it.get_group(0);
1447
+ auto wi_id = it.get_local_id(0);
1448
+
1449
+ const int num_blocks_per_row = kx / QK8_1;
1450
+ auto row = subgroup_id / num_blocks_per_row;
1451
+ auto col = subgroup_id % num_blocks_per_row;
1452
+
1453
+ auto row_offset = row * (kx_padded / QK8_1) * sizeof(block_q8_1);
1454
+ auto col_offset = QK8_1 * col + wi_id * ElementsPerWI;
1455
+
1456
+ auto quant_ptr = (int8_t *) ((char *) reordered_q8_tensor + row_offset + col_offset);
1457
+ auto ds_ptr = (sycl::half2 *) ((char *) reordered_q8_tensor + row_offset + kx + col * sizeof(sycl::half2));
1458
+
1459
+ sycl::vec<float, ElementsPerWI> wi_f32_vals;
1460
+ sycl::vec<int8_t, ElementsPerWI> quantized_values;
1461
+
1462
+ auto float_ptr_offset = subgroup_id * QK8_1 + ElementsPerWI * wi_id;
1463
+ wi_f32_vals = *reinterpret_cast<const sycl::vec<float, ElementsPerWI> *>(x + float_ptr_offset);
1464
+
1465
+ float sum = 0.0f;
1466
+ float amax = 0.0f;
1467
+
1468
+ #pragma unroll(ElementsPerWI)
1469
+ for (int i = 0; i < ElementsPerWI; i++) {
1470
+ sum += wi_f32_vals[i];
1471
+ amax = sycl::fmax(amax, sycl::fabs(wi_f32_vals[i]));
1472
+ quantized_values[i] = 0;
1473
+ }
1474
+ sum = sycl::reduce_over_group(it.get_group(), sum, sycl::plus<float>());
1475
+ amax = sycl::reduce_over_group(it.get_group(), amax, sycl::maximum<float>());
1476
+ float d = amax == 0 ? 1 : amax / 127;
1477
+
1478
+ #pragma unroll(ElementsPerWI)
1479
+ for (int i = 0; i < ElementsPerWI; i++) {
1480
+ quantized_values[i] = sycl::round(wi_f32_vals[i] / d);
1481
+ }
1482
+
1483
+ d = amax == 0 ? 0 : d;
1484
+
1485
+ *reinterpret_cast<sycl::vec<int8_t, ElementsPerWI> *>(quant_ptr) = quantized_values;
1486
+ if (wi_id == 0) {
1487
+ *ds_ptr = sycl::half2(sycl::half(d), sycl::half(sum));
1488
+ }
1489
+ }
1490
+
1437
1491
  static void mul_mat_p021_f16_f32(
1438
1492
  const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
1439
1493
  const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
@@ -1718,23 +1772,30 @@ static void pool2d_nchw_kernel(
1718
1772
  o_ptr[cur_oh * ow + cur_ow] = res;
1719
1773
  }
1720
1774
 
1721
- static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
1722
- const int ky, const int kx_padded,
1723
- queue_ptr stream) {
1724
- const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
1725
- const sycl::range<3> num_blocks(1, ky, block_num_x);
1726
- int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
1727
- static_assert(QK8_1 % WARP_SIZE == 0);
1728
- const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
1729
- {
1730
- dpct::has_capability_or_fail(stream->get_device(),
1731
- {sycl::aspect::fp16});
1775
+ static void quantize_row_q8_1_sycl(const float * x, void * vy, const int kx, const int ky, const int kx_padded,
1776
+ bool reorder_q8_tensor, queue_ptr stream) {
1777
+ if (reorder_q8_tensor) {
1778
+ auto local_range = std::size_t(WARP_SIZE);
1779
+ auto num_quant_blocks = ky * (kx / QK8_1);
1780
+ auto global_range = num_quant_blocks * local_range;
1781
+ stream->parallel_for(sycl::nd_range<1>({ global_range }, { local_range }),
1782
+ [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1783
+ quantize_and_reorder_q8_1<QK8_1 / WARP_SIZE>(x, vy, kx, kx_padded, it);
1784
+ });
1785
+ } else {
1786
+ const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
1787
+ const sycl::range<3> num_blocks(1, ky, block_num_x);
1788
+ int constexpr QUANT_BLOCK_TILE = QK8_1 / WARP_SIZE;
1789
+ static_assert(QK8_1 % WARP_SIZE == 0);
1790
+ const sycl::range<3> block_size(1, 1, SYCL_QUANTIZE_BLOCK_SIZE / QUANT_BLOCK_TILE);
1791
+ {
1792
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
1732
1793
 
1733
- stream->parallel_for(
1734
- sycl::nd_range<3>(num_blocks * block_size, block_size),
1735
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1736
- quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1737
- });
1794
+ stream->parallel_for(sycl::nd_range<3>(num_blocks * block_size, block_size),
1795
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
1796
+ quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
1797
+ });
1798
+ }
1738
1799
  }
1739
1800
  }
1740
1801
 
@@ -2066,21 +2127,18 @@ inline void ggml_sycl_op_mul_mat_sycl(
2066
2127
  const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
2067
2128
  ? (const sycl::half *)src1->data + src1_padded_row_size
2068
2129
  : src1_as_f16.get();
2069
- ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
2070
2130
 
2071
2131
  #if GGML_SYCL_DNNL
2072
2132
  if (!g_ggml_sycl_disable_dnn) {
2073
2133
  DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
2074
2134
  DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
2075
- dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
2076
- scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2,
2077
- " : converting dst to fp32");
2078
- const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
2079
- to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
2135
+ dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
2080
2136
  }
2081
2137
  else
2082
2138
  #endif
2083
2139
  {
2140
+ ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool(), row_diff * src1_ncols);
2141
+
2084
2142
  const sycl::half alpha_f16 = 1.0f;
2085
2143
  const sycl::half beta_f16 = 0.0f;
2086
2144
  SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
@@ -2446,9 +2504,10 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2446
2504
  dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(ctx.pool(i), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
2447
2505
 
2448
2506
  if (src1_on_device && src1_is_contiguous) {
2507
+ bool reorder_q8_tensor = src0->extra && ((ggml_tensor_extra_gpu *)src0->extra)->optimized_feature.reorder;
2449
2508
  scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2450
2509
  /*num_src=*/2, " : converting src1 to Q8_1");
2451
- quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
2510
+ quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, reorder_q8_tensor, stream);
2452
2511
  /*
2453
2512
  DPCT1010:90: SYCL uses exceptions to report errors and does not
2454
2513
  use the error codes. The call was replaced with 0. You need to
@@ -2554,7 +2613,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
2554
2613
  if (convert_src1_to_q8_1 && !src1_is_contiguous) {
2555
2614
  scope_op_debug_print scope_dbg_print(__func__, "/quantize_row_q8_1_sycl", dst,
2556
2615
  /*num_src=*/2, " : converting src1 to Q8_1");
2557
- quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
2616
+ quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, false, stream);
2558
2617
  /*
2559
2618
  DPCT1010:92: SYCL uses exceptions to report errors and does
2560
2619
  not use the error codes. The call was replaced with 0. You
@@ -2928,6 +2987,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
2928
2987
  case GGML_TYPE_Q4_0:
2929
2988
  return true;
2930
2989
  case GGML_TYPE_Q4_K:
2990
+ case GGML_TYPE_Q6_K:
2931
2991
  return !g_ggml_sycl_prioritize_dmmv;
2932
2992
  default:
2933
2993
  return false;
@@ -2947,6 +3007,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
2947
3007
  switch (type) {
2948
3008
  case GGML_TYPE_Q4_0:
2949
3009
  case GGML_TYPE_Q4_K:
3010
+ case GGML_TYPE_Q6_K:
2950
3011
  return true;
2951
3012
  default:
2952
3013
  return false;
@@ -3031,6 +3092,50 @@ static void reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d
3031
3092
  sycl::free(tmp_buf, *stream);
3032
3093
  }
3033
3094
 
3095
+ static void reorder_qw_q6_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) {
3096
+ GGML_ASSERT(size % sizeof(block_q6_K) == 0);
3097
+ GGML_ASSERT(offset % sizeof(block_q6_K) == 0);
3098
+
3099
+ const int nblocks = size / sizeof(block_q6_K);
3100
+
3101
+ auto * tmp_buf = sycl::malloc_shared<uint8_t>(size, *stream);
3102
+ SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size).wait()));
3103
+
3104
+ auto * ql_ptr = data_device;
3105
+ auto * qh_ptr = ql_ptr + (QK_K / 2) * nblocks;
3106
+ auto * scales_ptr = qh_ptr + (QK_K / 4) * nblocks;
3107
+ sycl::half * dm_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * nblocks);
3108
+
3109
+ stream
3110
+ ->parallel_for(nblocks,
3111
+ [=](auto i) {
3112
+ const block_q6_K * x = (const block_q6_K *) tmp_buf;
3113
+ const int ib = i;
3114
+
3115
+ const uint8_t * ql = x[ib].ql;
3116
+ const uint8_t * qh = x[ib].qh;
3117
+ uint8_t * base_ql_ptr = ql_ptr + (QK_K / 2) * ib;
3118
+ uint8_t * base_qh_ptr = qh_ptr + (QK_K / 4) * ib;
3119
+ uint8_t * base_scales_ptr = scales_ptr + (QK_K / 16) * ib;
3120
+
3121
+ for (int j = 0; j < QK_K / 2; ++j) {
3122
+ base_ql_ptr[j] = ql[j];
3123
+ }
3124
+ for (int j = 0; j < QK_K / 4; ++j) {
3125
+ base_qh_ptr[j] = qh[j];
3126
+ }
3127
+
3128
+ for (int j = 0; j < QK_K / 16; ++j) {
3129
+ base_scales_ptr[j] = x[ib].scales[j];
3130
+ }
3131
+
3132
+ dm_ptr[ib] = x[ib].d;
3133
+ })
3134
+ .wait_and_throw();
3135
+
3136
+ sycl::free(tmp_buf, *stream);
3137
+ }
3138
+
3034
3139
  static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3035
3140
  uint8_t * data_device = (uint8_t *) src0->data;
3036
3141
  size_t ncols = src0->ne[0];
@@ -3044,6 +3149,9 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
3044
3149
  case GGML_TYPE_Q4_K:
3045
3150
  reorder_qw_q4_k(data_device, size, 0, stream);
3046
3151
  break;
3152
+ case GGML_TYPE_Q6_K:
3153
+ reorder_qw_q6_k(data_device, size, 0, stream);
3154
+ break;
3047
3155
  default:
3048
3156
  GGML_ABORT("reorder_qw() called with unsupported type");
3049
3157
  break;
@@ -3755,7 +3863,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
3755
3863
  const void *data, size_t offset,
3756
3864
  size_t size) try {
3757
3865
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3758
- debug_print_tensor(": tensor=", tensor);
3866
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
3759
3867
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
3760
3868
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3761
3869
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3776,7 +3884,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
3776
3884
  void *data, size_t offset,
3777
3885
  size_t size) try {
3778
3886
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3779
- debug_print_tensor(": tensor=", tensor);
3887
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str());
3780
3888
  GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset);
3781
3889
  ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
3782
3890
  ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
@@ -3799,8 +3907,8 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
3799
3907
  bool is_cpy_supported = dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) &&
3800
3908
  ggml_backend_buffer_is_sycl(src->buffer);
3801
3909
  GGML_SYCL_DEBUG("[SYCL] call %s", __func__);
3802
- debug_print_tensor(": dst=", dst);
3803
- debug_print_tensor(" src=", src);
3910
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str());
3911
+ GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str());
3804
3912
  GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported);
3805
3913
  if (is_cpy_supported) {
3806
3914
  /*
@@ -4165,6 +4273,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4165
4273
  {
4166
4274
  ggml_type src0_type = op->src[0]->type;
4167
4275
  ggml_type src1_type = op->src[1]->type;
4276
+ if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
4277
+ return true;
4278
+ }
4168
4279
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
4169
4280
  return true;
4170
4281
  }
@@ -4210,6 +4321,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4210
4321
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
4211
4322
  return true;
4212
4323
  }
4324
+ if(src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
4325
+ return true;
4326
+ }
4327
+ if(src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
4328
+ return true;
4329
+ }
4330
+ if(src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
4331
+ return true;
4332
+ }
4333
+ if(src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
4334
+ return true;
4335
+ }
4336
+ if(src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
4337
+ return true;
4338
+ }
4213
4339
  return false;
4214
4340
  }
4215
4341
  case GGML_OP_CONCAT:
@@ -29,24 +29,23 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
29
29
  static_assert(blocks_per_subgroup > 0);
30
30
  static_assert(block_elements_per_subgroup > 0);
31
31
 
32
- const block_q8_1 * y = (const block_q8_1 *) vy;
33
-
34
32
  float partial_sum = 0.0f;
35
33
  for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
36
- const int ibx = row * blocks_per_row + i; // x block index
37
- // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits
38
- const int bx_offset = block_type::get_block_offset(ibx);
39
- const int d_offset = block_type::get_d_offset(nrows, ncols, ibx);
34
+ const int ibx = row * blocks_per_row + i; // x block index
40
35
 
36
+ const auto bx_offset = block_type::get_block_offset(ibx, nblocks);
37
+ const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx);
41
38
  // Y block index that aligns with ibx
42
39
  const int iby = i * block_type::block_to_q8_1_ratio();
40
+ const int8_t* q8_1_quant_ptr = (const int8_t*)vy + iby * QK8_1;
41
+ const sycl::half2* q8_1_ds_ptr = (const sycl::half2*)((const char*)vy + ncols + iby * sizeof(sycl::half2));
43
42
 
44
43
  #pragma unroll
45
44
  for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
46
45
  // x block quant index when casting the quants to int
47
46
  const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
48
47
 
49
- partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
48
+ partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs);
50
49
  }
51
50
  }
52
51
 
@@ -785,6 +784,24 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
785
784
  }
786
785
  }
787
786
 
787
+ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
788
+ const int nrows, dpct::queue_ptr stream) {
789
+ GGML_ASSERT(ncols % QK_K == 0);
790
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
791
+ constexpr size_t num_subgroups = 16;
792
+ GGML_ASSERT(block_num_y % num_subgroups == 0);
793
+
794
+ const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
795
+ const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
796
+
797
+ stream->submit([&](sycl::handler & cgh) {
798
+ cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
799
+ [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
800
+ mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K>>(vx, vy, dst, ncols, nrows,
801
+ nd_item);
802
+ });
803
+ });
804
+ }
788
805
  static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
789
806
  float *dst, const int ncols,
790
807
  const int nrows,
@@ -1070,7 +1087,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
1070
1087
  mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1071
1088
  break;
1072
1089
  case GGML_TYPE_Q6_K:
1073
- mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1090
+ if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1091
+ ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1092
+ GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q6_k_q8_1_sycl\n");
1093
+ reorder_mul_mat_vec_q6_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1094
+ } else {
1095
+ GGML_SYCL_DEBUG("Calling mul_mat_vec_q6_k_q8_1_sycl\n");
1096
+ mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1097
+ }
1074
1098
  break;
1075
1099
  case GGML_TYPE_IQ1_S:
1076
1100
  mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
@@ -14,12 +14,13 @@
14
14
  #ifndef GGML_SYCL_QUANTS_HPP
15
15
  #define GGML_SYCL_QUANTS_HPP
16
16
 
17
+ #include <utility>
18
+
17
19
  #include "ggml-common.h"
18
20
  #include "ggml.h"
19
21
 
20
22
  namespace ggml_sycl_reordered {
21
23
 
22
-
23
24
  // The reordered block moves quants (qs) and scales(d) to two
24
25
  // uniform regions of memory that is contiguous in the same tensor.
25
26
  // What this means is that instead of having:
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {
32
33
 
33
34
  template <ggml_type type> struct block_q_t;
34
35
 
35
-
36
36
  // qk number of weights / quants in a block
37
37
  // qr number of weights in a byte (described as 'before dequantization')
38
38
  // for quantization types that has low and high bits split, qr is calculated with
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
47
47
  static constexpr uint32_t vdr_mmvq = 2;
48
48
  };
49
49
 
50
- static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
50
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
51
+ return { block_index * (traits::qk / traits::qr), 0 };
52
+ }
51
53
 
52
- static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
53
- return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
54
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
55
+ return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
54
56
  }
55
57
 
56
58
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
64
66
  static constexpr uint32_t vdr_mmvq = 2;
65
67
  };
66
68
 
67
- static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
69
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
70
+ return { block_index * (traits::qk / traits::qr), 0 };
71
+ }
68
72
 
69
- static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
73
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
70
74
  auto nblocks = (nrows * (ncols / traits::qk));
71
- return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
75
+ return { nblocks * (QK_K / 2),
76
+ (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
72
77
  }
73
78
 
74
79
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
75
80
 
76
81
  constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
77
-
78
- constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
79
82
  };
80
83
 
84
+ template <> struct block_q_t<GGML_TYPE_Q6_K> {
85
+ struct traits {
86
+ static constexpr uint32_t qk = QK_K;
87
+ static constexpr uint32_t qi = QI6_K;
88
+ static constexpr uint32_t qr = QR6_K;
89
+ static constexpr uint32_t vdr_mmvq = 1;
90
+ };
91
+
92
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
93
+ auto low_bits_index = block_index * (traits::qk / traits::qr);
94
+ // the index of high bits it's after all low bits
95
+ auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
96
+ return { low_bits_index, high_bits_index };
97
+ }
98
+
99
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
100
+ auto nblocks = (nrows * (ncols / traits::qk));
101
+ auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
102
+ auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
103
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
104
+ return { block_scales, sb_scale };
105
+ }
106
+
107
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
108
+ };
81
109
  } // namespace ggml_sycl_reordered
82
110
 
83
111
  #endif // GGML_SYCL_QUANTS_HPP