@novastera-oss/llamarn 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (268)
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/PureCppImpl.cpp +9 -27
  14. package/cpp/SystemUtils.h +2 -2
  15. package/cpp/build-info.cpp +2 -2
  16. package/cpp/llama.cpp/README.md +11 -3
  17. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  18. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  19. package/cpp/llama.cpp/common/arg.cpp +153 -113
  20. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  21. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  22. package/cpp/llama.cpp/common/chat.cpp +847 -699
  23. package/cpp/llama.cpp/common/chat.h +73 -6
  24. package/cpp/llama.cpp/common/common.cpp +50 -82
  25. package/cpp/llama.cpp/common/common.h +21 -17
  26. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  27. package/cpp/llama.cpp/common/json-partial.h +37 -0
  28. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  29. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  30. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  31. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  32. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  33. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  34. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  35. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  36. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  37. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  38. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  39. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  40. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  70. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  74. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  75. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  77. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  86. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  87. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  120. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  121. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  122. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  123. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  124. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  125. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  126. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  127. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  128. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  129. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  130. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  131. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  132. package/cpp/llama.cpp/include/llama.h +62 -125
  133. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  150. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  152. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  159. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  160. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  161. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  162. package/cpp/llama.cpp/models/templates/README.md +2 -0
  163. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  164. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  165. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  166. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  167. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  168. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  169. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  170. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  171. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  172. package/cpp/llama.cpp/src/llama-context.h +30 -0
  173. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  174. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  175. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  176. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  177. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  178. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  179. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  180. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  181. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  182. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  183. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  184. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  185. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  186. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  187. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  188. package/cpp/llama.cpp/src/llama-model.h +6 -1
  189. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  190. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  191. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  192. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  193. package/cpp/llama.cpp/src/llama.cpp +14 -0
  194. package/cpp/rn-completion.cpp +60 -5
  195. package/ios/include/chat.h +73 -6
  196. package/ios/include/common/minja/chat-template.hpp +9 -5
  197. package/ios/include/common/minja/minja.hpp +69 -36
  198. package/ios/include/common.h +21 -17
  199. package/ios/include/llama.h +62 -125
  200. package/ios/libs/llama.xcframework/Info.plist +19 -19
  201. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  205. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  206. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  212. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  213. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  227. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  228. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  233. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  234. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  240. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  241. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  242. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  246. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  247. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  253. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  254. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  255. package/package.json +1 -1
  256. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  257. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  267. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  268. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -49,35 +49,38 @@ endif()
49
49
  target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
50
50
 
51
51
  # Link against oneDNN
52
- find_package(DNNL)
53
52
  set(GGML_SYCL_DNNL 0)
54
- if(DNNL_FOUND)
55
- if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
56
- # Assuming oneDNN packaged with oneapi release is used which
57
- # supports only intel target
58
- set(DNNL_GPU_VENDOR "INTEL")
59
- if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
60
- message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
53
+ if(GGML_SYCL_DNN)
54
+ find_package(DNNL)
55
+ if(DNNL_FOUND)
56
+ if (NOT DEFINED DNNL_GPU_VENDOR)
57
+ # default to intel target
58
+ set(DNNL_GPU_VENDOR "INTEL")
59
+ if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
60
+ message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
61
+ endif()
61
62
  endif()
62
- endif()
63
63
 
64
- # Verify oneDNN was compiled for the same target as llama
65
- if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
66
- target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
67
- set(GGML_SYCL_DNNL 1)
68
- get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
69
- foreach(CONFIG ${CONFIGS})
70
- get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
71
- message(STATUS "Found oneDNN: ${DNNL_LIB}")
72
- endforeach()
64
+ # Verify oneDNN was compiled for the same target as llama
65
+ if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
66
+ target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
67
+ set(GGML_SYCL_DNNL 1)
68
+ get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
69
+ foreach(CONFIG ${CONFIGS})
70
+ get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
71
+ message(STATUS "Found oneDNN: ${DNNL_LIB}")
72
+ endforeach()
73
+ else()
74
+ message(WARNING
75
+ "oneDNN must be compiled for the same target as llama.cpp.
76
+ llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
77
+ Disabling oneDNN support.")
78
+ endif()
73
79
  else()
74
- message(WARNING
75
- "oneDNN must be compiled for the same target as llama.cpp.
76
- llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
77
- Disabling oneDNN support.")
80
+ message(STATUS "oneDNN not found, disabling oneDNN support")
78
81
  endif()
79
82
  else()
80
- message(STATUS "oneDNN not found, disabling oneDNN support")
83
+ message(STATUS "oneDNN support disabled by the user")
81
84
  endif()
82
85
  target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
83
86
 
@@ -108,6 +111,9 @@ endif()
108
111
  if (GGML_SYCL_TARGET STREQUAL "INTEL")
109
112
  # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
110
113
  # See https://github.com/uxlfoundation/oneMath/issues/654
114
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
115
+ set(SYCL_COMPILER ON)
116
+ endif()
111
117
  find_package(MKL REQUIRED)
112
118
  target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
113
119
  target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
@@ -319,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds
319
319
 
320
320
 
321
321
  void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
322
- GGML_SYCL_DEBUG("call %s\n", __func__);
322
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
323
323
  ggml_sycl_op_add(ctx, dst);
324
- GGML_SYCL_DEBUG("call %s done\n", __func__);
325
324
  }
326
325
 
327
326
  void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
328
- GGML_SYCL_DEBUG("call %s\n", __func__);
327
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
329
328
  ggml_sycl_op_sub(ctx, dst);
330
- GGML_SYCL_DEBUG("call %s done\n", __func__);
331
329
  }
332
330
 
333
331
  void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
334
- GGML_SYCL_DEBUG("call %s\n", __func__);
332
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
335
333
  ggml_sycl_op_mul(ctx, dst);
336
- GGML_SYCL_DEBUG("call %s done\n", __func__);
337
334
  }
338
335
 
339
336
  void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
340
- GGML_SYCL_DEBUG("call %s\n", __func__);
337
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
341
338
  ggml_sycl_op_div(ctx, dst);
342
- GGML_SYCL_DEBUG("call %s done\n", __func__);
343
339
  }
344
340
 
345
341
  void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
346
- GGML_SYCL_DEBUG("call %s\n", __func__);
342
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
347
343
  ggml_sycl_op_repeat(ctx, dst);
348
- GGML_SYCL_DEBUG("call %s done\n", __func__);
349
344
  }
350
345
 
@@ -13,8 +13,10 @@
13
13
  #ifndef GGML_SYCL_COMMON_HPP
14
14
  #define GGML_SYCL_COMMON_HPP
15
15
 
16
+ #include <cstddef>
16
17
  #include <fstream>
17
18
  #include <iostream>
19
+ #include <string>
18
20
 
19
21
  #include "dpct/helper.hpp"
20
22
  #include "ggml-sycl.h"
@@ -44,11 +46,20 @@ extern int g_ggml_sycl_debug;
44
46
  extern int g_ggml_sycl_disable_optimize;
45
47
  extern int g_ggml_sycl_prioritize_dmmv;
46
48
 
47
- #define GGML_SYCL_DEBUG(...) \
48
- do { \
49
- if (g_ggml_sycl_debug) \
50
- fprintf(stderr, __VA_ARGS__); \
51
- } while (0)
49
+ #if defined(__clang__) && __has_builtin(__builtin_expect)
50
+ // Hint the optimizer to pipeline the more likely following instruction in branches
51
+ # define LIKELY(expr) __builtin_expect(expr, true)
52
+ # define UNLIKELY(expr) __builtin_expect(expr, false)
53
+ #else
54
+ # define LIKELY(expr) (expr)
55
+ # define UNLIKELY(expr) (expr)
56
+ #endif
57
+
58
+ #define GGML_SYCL_DEBUG(...) \
59
+ do { \
60
+ if (UNLIKELY(g_ggml_sycl_debug)) \
61
+ fprintf(stderr, __VA_ARGS__); \
62
+ } while (0)
52
63
 
53
64
  #define CHECK_TRY_ERROR(expr) \
54
65
  [&]() { \
@@ -471,6 +482,19 @@ static __dpct_inline__ float warp_reduce_max(float x,
471
482
  return x;
472
483
  }
473
484
 
485
+ /* Helper for computing the linear offset of a ggml_tensor element given
486
+ per-dimension strides and indices */
487
+ template<int N>
488
+ __dpct_inline__ size_t calculate_offset(const std::array<int, N> & strides, const std::array<int, N> & indices) {
489
+ size_t offset = 0;
490
+ #pragma unroll
491
+ for (int i = 0; i < N; i++) {
492
+ auto index_i = indices[i];
493
+ offset += strides[i] * index_i;
494
+ }
495
+ return offset;
496
+ }
497
+
474
498
  // Helper for vec loading aligned data
475
499
  template <typename Tp, int n>
476
500
  inline sycl::vec<Tp, n> vec_aligned_load(const Tp* aligned_ptr) {
@@ -490,4 +514,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
490
514
  }
491
515
 
492
516
  bool gpu_has_xmx(sycl::device &dev);
517
+
518
+ template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
519
+ if (LIKELY(!g_ggml_sycl_debug)) {
520
+ return;
521
+ }
522
+ std::stringstream ss;
523
+ ss << prefix << "=[";
524
+ for (std::size_t i = 0; i < N - 1; ++i) {
525
+ ss << array[i] << ", ";
526
+ }
527
+ if constexpr (N > 0) {
528
+ ss << array[N - 1];
529
+ }
530
+ ss << "]";
531
+ GGML_SYCL_DEBUG("%s", ss.str().c_str());
532
+ }
533
+
534
+ inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
535
+ const std::string & suffix = "") {
536
+ if (LIKELY(!g_ggml_sycl_debug)) {
537
+ return;
538
+ }
539
+ GGML_SYCL_DEBUG("%s=", prefix.c_str());
540
+ if (tensor) {
541
+ GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
542
+ debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
543
+ debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
544
+ if (!ggml_is_contiguous(tensor)) {
545
+ GGML_SYCL_DEBUG(";strided");
546
+ }
547
+ if (ggml_is_permuted(tensor)) {
548
+ GGML_SYCL_DEBUG(";permuted");
549
+ }
550
+ } else {
551
+ GGML_SYCL_DEBUG("nullptr");
552
+ }
553
+ GGML_SYCL_DEBUG("%s", suffix.c_str());
554
+ }
555
+
556
+ // Use scope_op_debug_print to log operations coming from running a model
557
+ struct scope_op_debug_print {
558
+ // Use string_views to avoid the cost of creating and concatenating std::strings.
559
+ // The viewed strings must outlive this object and be null-terminated, because they are printed via data() with "%s".
560
+ // In practice scope_op_debug_print is used with string literals, which have static storage duration and so are always accessible.
561
+ scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
562
+ std::size_t num_src, const std::string_view & suffix = "") :
563
+ func(func),
564
+ func_suffix(func_suffix) {
565
+ if (LIKELY(!g_ggml_sycl_debug)) {
566
+ return;
567
+ }
568
+ GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
569
+ debug_print_tensor(" dst", dst);
570
+ if (dst) {
571
+ for (std::size_t i = 0; i < num_src; ++i) {
572
+ debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
573
+ }
574
+ }
575
+ GGML_SYCL_DEBUG("%s\n", suffix.data());
576
+ }
577
+
578
+ scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
579
+ const std::string_view & suffix = "") :
580
+ scope_op_debug_print(func, "", dst, num_src, suffix) {}
581
+
582
+ ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
583
+
584
+ private:
585
+ std::string_view func;
586
+ std::string_view func_suffix;
587
+ };
588
+
493
589
  #endif // GGML_SYCL_COMMON_HPP
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
159
159
  }
160
160
 
161
161
  void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
162
- const ggml_tensor *src0 = dst->src[0];
163
- const ggml_tensor *src1 = dst->src[1];
164
- queue_ptr stream = ctx.stream();
165
-
166
- const int32_t dim = ((int32_t *)dst->op_params)[0];
167
-
168
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
169
- const float *src0_d = (const float *)src0->data;
170
- const float *src1_d = (const float *)src1->data;
171
-
172
- float *dst_d = (float *)dst->data;
173
-
174
- if (dim != 3) {
175
- for (int i3 = 0; i3 < dst->ne[3]; i3++) {
176
- concat_f32_sycl(
177
- src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
178
- dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
179
- src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
180
- }
162
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
163
+ const ggml_tensor * src0 = dst->src[0];
164
+ const ggml_tensor * src1 = dst->src[1];
165
+ queue_ptr stream = ctx.stream();
166
+
167
+ const int32_t dim = ((int32_t *) dst->op_params)[0];
168
+
169
+ if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
170
+ const float * src0_d = (const float *) src0->data;
171
+ const float * src1_d = (const float *) src1->data;
172
+
173
+ float * dst_d = (float *) dst->data;
174
+
175
+ if (dim != 3) {
176
+ for (int i3 = 0; i3 < dst->ne[3]; i3++) {
177
+ concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
178
+ dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
179
+ dst->ne[1], dst->ne[2], dim, stream);
180
+ }
181
+ } else {
182
+ const size_t size0 = ggml_nbytes(src0);
183
+ const size_t size1 = ggml_nbytes(src1);
184
+
185
+ SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
186
+ SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
187
+ }
181
188
  } else {
182
- const size_t size0 = ggml_nbytes(src0);
183
- const size_t size1 = ggml_nbytes(src1);
184
-
185
- SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
186
- SYCL_CHECK(CHECK_TRY_ERROR(
187
- stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
189
+ concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
190
+ src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
191
+ src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
192
+ src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
193
+ dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
188
194
  }
189
- } else
190
- concat_f32_sycl_non_cont(
191
- stream, (const char *)src0->data, (const char *)src1->data,
192
- (char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
193
- src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
194
- src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
195
- src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
196
- dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
197
195
  }
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
72
72
  }
73
73
 
74
74
  void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
75
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
75
76
  const ggml_tensor *src0 = dst->src[0];
76
77
  const ggml_tensor *src1 = dst->src[1];
77
78
  const float * src0_d = (const float *)src0->data;
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
183
183
  }
184
184
  }
185
185
 
186
// Host-side launcher for dequantize_block_q4_K_reorder (the Q4_K kernel for the "reorder" layout).
//   vx     - opaque pointer to the quantized input; forwarded unchanged to the kernel
//   y      - output buffer of dst_t; forwarded unchanged to the kernel
//   k      - element count; nb = k / QK_K blocks are processed (integer division, so a
//            trailing partial block, if any, is not covered)
//   stream - SYCL queue the kernel is submitted to
// The submission is not waited on inside this function.
+ template <typename dst_t>
187
+ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
188
+ const int64_t nb = k / QK_K;
189
// One work-group of 32 work-items per block: global size = nb * 32.
+ const size_t local_size = 32;
190
+ const size_t global_size = nb * local_size;
191
+
192
// NOTE(review): presumably fails hard when the device lacks fp16 support - confirm against the dpct helper.
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
193
+
194
+ stream->submit([&](sycl::handler & cgh) {
195
// 12 bytes of work-group local scratch, handed to the kernel as its scales_local buffer.
+ sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
196
+
197
+ cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
198
// The kernel lambda captures vx, y, nb and the accessor by value.
+ [=](sycl::nd_item<1> item_ct1) {
199
+ dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
200
+ });
201
+ });
202
+ }
203
+
186
204
  template <typename dst_t>
187
205
  static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
188
206
  dpct::queue_ptr stream) {
@@ -504,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
504
522
  case GGML_TYPE_Q3_K:
505
523
  return dequantize_row_q3_K_sycl;
506
524
  case GGML_TYPE_Q4_K:
507
- return dequantize_row_q4_K_sycl;
525
+ if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
526
+ return dequantize_row_q4_K_sycl_reorder;
527
+ } else {
528
+ return dequantize_row_q4_K_sycl;
529
+ }
508
530
  case GGML_TYPE_Q5_K:
509
531
  return dequantize_row_q5_K_sycl;
510
532
  case GGML_TYPE_Q6_K:
@@ -556,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
556
578
  case GGML_TYPE_Q3_K:
557
579
  return dequantize_row_q3_K_sycl;
558
580
  case GGML_TYPE_Q4_K:
559
- return dequantize_row_q4_K_sycl;
581
+ if (dst->src[0]->extra &&
582
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
583
+ return dequantize_row_q4_K_sycl_reorder;
584
+ } else {
585
+ return dequantize_row_q4_K_sycl;
586
+ }
560
587
  case GGML_TYPE_Q5_K:
561
588
  return dequantize_row_q5_K_sycl;
562
589
  case GGML_TYPE_Q6_K:
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
616
616
  }
617
617
 
618
618
  void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
619
+ // Unlike other operators, ggml_sycl_cpy takes 2 distinct tensors instead of taking a dst ggml_tensor and relying on its src field
620
+ scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
621
+ std::string(" src0 type=") + ggml_type_name(src0->type));
619
622
  const int64_t ne = ggml_nelements(src0);
620
623
  GGML_ASSERT(ne == ggml_nelements(src1));
621
624
 
@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
629
632
 
630
633
  char * src0_ddc = (char *) src0->data;
631
634
  char * src1_ddc = (char *) src1->data;
632
- GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
633
- ggml_type_name(src1->type));
634
635
 
635
636
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
636
637
  ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
694
695
  }
695
696
 
696
697
  // Duplicates dst->src[0] into dst by forwarding both tensors to ggml_sycl_cpy.
  // In this hunk the explicit GGML_SYCL_DEBUG enter/exit messages (removed lines) are
  // replaced by a scope_op_debug_print object constructed at function entry (added line).
  void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
697
- // TODO: why do we pass dst as src1 here?
698
- GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
698
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
699
699
  // dst is passed as the second tensor argument of ggml_sycl_cpy.
  ggml_sycl_cpy(ctx, dst->src[0], dst);
700
- GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
701
700
  }
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
357
357
  }
358
358
  #endif
359
359
 
360
// Shared per-work-item Q4_K dequantization step: decodes 4 packed bytes into 8 outputs.
//   y            - output pointer already offset for this work-item; writes y[0..3] and y[32..35]
//   qs_ptr       - start of the packed 4-bit quants for the current block
//   dall, dmin   - block-level multipliers applied to the unpacked scale and min values
//   scales_local - packed scale/min bytes, unpacked through get_scale_min_k4
//   il, ir       - work-item coordinates; il picks the scale pair and the 32-byte quant group,
//                  ir picks the 4-byte slice inside that group
+ template <typename dst_t>
361
+ inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
362
+ const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
363
// Two consecutive scale/min entries are used: index `is` for the low nibbles, `is + 1` for the high nibbles.
+ const int is = 2 * il;
364
+ constexpr int n = 4;
365
+
366
+ uint8_t sc, m;
367
+ get_scale_min_k4(is + 0, scales_local, sc, m);
368
+ const float d1 = dall * sc;
369
+ const float m1 = dmin * m;
370
+
371
+ get_scale_min_k4(is + 1, scales_local, sc, m);
372
+ const float d2 = dall * sc;
373
+ const float m2 = dmin * m;
374
+
375
// NOTE(review): vec_aligned_load presumably requires qs_ptr + 32 * il + n * ir to be suitably aligned - confirm.
+ sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
376
+ for (int l = 0; l < n; ++l) {
377
// Low nibble -> y[l], high nibble -> y[l + 32]; each is scaled and then has its min term subtracted.
+ y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
378
+ y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
379
+ }
380
+ }
381
+
360
382
  template<typename dst_t>
361
383
  static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
362
384
  uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
365
387
  const int64_t i = item_ct1.get_group(2);
366
388
 
367
389
  #if QK_K == 256
368
- // assume 32 threads
369
390
  const int64_t tid = item_ct1.get_local_id(2);
370
- const int64_t il = tid/8;
371
- const int64_t ir = tid%8;
372
- const int64_t is = 2*il;
373
- const int64_t n = 4;
391
+ const int64_t il = tid / 8;
392
+ const int64_t ir = tid % 8;
374
393
 
375
- dst_t * y = yy + i*QK_K + 64*il + n*ir;
394
+ dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
376
395
 
377
396
  const sycl::half2 dm = x[i].dm;
378
397
  const float dall = dm[0];
379
398
  const float dmin = dm[1];
380
399
 
381
- if (tid < 12)
400
+ if (tid < 12) {
382
401
  scales_local[tid] = x[i].scales[tid];
383
- item_ct1.barrier(sycl::access::fence_space::local_space);
384
-
385
- uint8_t sc, m;
386
- get_scale_min_k4(is + 0, scales_local, sc, m);
387
- const float d1 = dall * sc;
388
- const float m1 = dmin * m;
389
- get_scale_min_k4(is + 1, scales_local, sc, m);
390
- const float d2 = dall * sc;
391
- const float m2 = dmin * m;
392
-
393
- sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
394
- for (int l = 0; l < n; ++l) {
395
- y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
396
- y[l +32] = d2 * (q_vec[l] >> 4) - m2;
397
402
  }
403
+
404
+ item_ct1.barrier(sycl::access::fence_space::local_space);
405
+ dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
398
406
  #else
399
407
  const int64_t tid = item_ct1.get_local_id(2);
400
408
  const uint8_t * q = x[i].qs;
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
406
414
  #endif
407
415
  }
408
416
 
417
// Device kernel body: dequantizes one Q4_K block per work-group from the "reorder" layout.
// As addressed by the offsets below, vx holds three consecutive regions:
//   [ quants: nb * (QK_K / 2) bytes ][ scales: nb * K_SCALE_SIZE bytes ][ dm: nb * sizeof(ggml_half2) bytes ]
//   vx           - base of the reordered buffer
//   yy           - output buffer; block i writes starting at yy + i * QK_K
//   scales_local - work-group local scratch that receives this block's scale bytes
//   item_ct1     - 1-D nd_item; the group id selects the block, the local id selects the slice
//   nb           - total number of blocks, needed to locate the scales and dm regions
// NOTE(review): the tid / 8 and tid % 8 split assumes 32 work-items per group, as launched by
// dequantize_row_q4_K_sycl_reorder - confirm for any other caller.
+ template <typename dst_t>
418
+ static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
419
+ const sycl::nd_item<1> & item_ct1, int64_t nb) {
420
+ const int64_t i = item_ct1.get_group(0); // block index
421
+ const int64_t tid = item_ct1.get_local_id(0); // thread index within block
422
+ const int64_t il = tid / 8;
423
+ const int64_t ir = tid % 8;
424
+
425
// Output position of this work-item inside block i.
+ dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
426
+
427
+ const uint8_t * base = static_cast<const uint8_t *>(vx);
428
// Byte offsets of block i within each of the three regions.
+ const size_t qs_offset = i * (QK_K / 2);
429
+ const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
430
+ const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
431
+
432
+ const uint8_t * qs_ptr = base + qs_offset;
433
+ const uint8_t * scales_ptr = base + scales_offset;
434
+ ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
435
+
436
+ const float dall = dm_values.x();
437
+ const float dmin = dm_values.y();
438
+
439
// The first 12 work-items each copy one scale byte into local scratch
// (NOTE(review): 12 presumably equals K_SCALE_SIZE - confirm).
+ if (tid < 12) {
440
+ scales_local[tid] = scales_ptr[tid];
441
+ }
442
+
443
// Work-group barrier so every work-item sees the scale bytes written above before decoding.
+ item_ct1.barrier(sycl::access::fence_space::local_space);
444
+ dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
445
+ }
446
+
409
447
  template<typename dst_t>
410
448
  static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
411
449
  const sycl::nd_item<3> &item_ct1) {
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
1092
1092
  src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1093
1093
 
1094
1094
  if (src1_convert_f16) {
1095
+ scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
1096
+ " : converting src1 to fp16");
1095
1097
  src1_dfloat = src1_dfloat_a.alloc(ne00);
1096
1098
  const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
1097
1099
  GGML_ASSERT(to_fp16_sycl != nullptr);
@@ -1129,7 +1131,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
1129
1131
  dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1130
1132
  break;
1131
1133
  case GGML_TYPE_Q4_K:
1132
- dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1134
+ if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1135
+ ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1136
+ // reorder is currently not supported for dmmv
1137
+ GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
1138
+ } else {
1139
+ dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1140
+ }
1133
1141
  break;
1134
1142
  case GGML_TYPE_Q5_K:
1135
1143
  dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);