@novastera-oss/llamarn 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266):
  1. package/README.md +80 -14
  2. package/RNLlamaCpp.podspec +10 -3
  3. package/android/CMakeLists.txt +8 -0
  4. package/android/src/main/cpp/include/llama.h +62 -125
  5. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  13. package/cpp/build-info.cpp +2 -2
  14. package/cpp/llama.cpp/README.md +11 -3
  15. package/cpp/llama.cpp/build-xcframework.sh +1 -0
  16. package/cpp/llama.cpp/common/CMakeLists.txt +8 -2
  17. package/cpp/llama.cpp/common/arg.cpp +153 -113
  18. package/cpp/llama.cpp/common/chat-parser.cpp +379 -0
  19. package/cpp/llama.cpp/common/chat-parser.h +117 -0
  20. package/cpp/llama.cpp/common/chat.cpp +847 -699
  21. package/cpp/llama.cpp/common/chat.h +73 -6
  22. package/cpp/llama.cpp/common/common.cpp +50 -82
  23. package/cpp/llama.cpp/common/common.h +21 -17
  24. package/cpp/llama.cpp/common/json-partial.cpp +255 -0
  25. package/cpp/llama.cpp/common/json-partial.h +37 -0
  26. package/cpp/llama.cpp/common/minja/chat-template.hpp +9 -5
  27. package/cpp/llama.cpp/common/minja/minja.hpp +69 -36
  28. package/cpp/llama.cpp/common/regex-partial.cpp +204 -0
  29. package/cpp/llama.cpp/common/regex-partial.h +56 -0
  30. package/cpp/llama.cpp/common/sampling.cpp +7 -8
  31. package/cpp/llama.cpp/convert_hf_to_gguf.py +453 -118
  32. package/cpp/llama.cpp/convert_hf_to_gguf_update.py +120 -68
  33. package/cpp/llama.cpp/ggml/CMakeLists.txt +2 -1
  34. package/cpp/llama.cpp/ggml/cmake/common.cmake +25 -0
  35. package/cpp/llama.cpp/ggml/include/ggml-opt.h +49 -28
  36. package/cpp/llama.cpp/ggml/include/ggml.h +26 -7
  37. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +16 -10
  38. package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +4 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +1 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +2 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +604 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +42 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +54 -2
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +50 -51
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -2
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +5 -9
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +779 -19
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +22 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +322 -100
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +117 -1
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +85 -16
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +220 -49
  56. package/cpp/llama.cpp/ggml/src/ggml-cuda/acc.cu +40 -26
  57. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +1 -1
  58. package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +11 -1
  59. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +15 -7
  60. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +266 -64
  61. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +49 -4
  62. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +48 -4
  63. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +2 -1
  64. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +5 -1
  65. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +2 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +7 -6
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/sum.cu +1 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +10 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +2 -0
  70. package/cpp/llama.cpp/ggml/src/ggml-impl.h +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +4 -0
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +99 -17
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +200 -2
  74. package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  75. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cu +112 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +12 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +6 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +972 -178
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/argsort.cl +86 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/div.cl +72 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/group_norm.cl +72 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sigmoid.cl +29 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sub.cl +72 -0
  84. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/sum_rows.cl +39 -0
  85. package/cpp/llama.cpp/ggml/src/ggml-opt.cpp +373 -190
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -10
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +101 -5
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +31 -33
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +1 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +29 -2
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +4 -5
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +9 -1
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +84 -72
  96. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +2 -0
  97. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  98. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +1 -3
  99. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +324 -129
  100. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +1 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +31 -2
  102. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +95 -68
  103. package/cpp/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +1 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +22 -0
  105. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -2
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +1 -4
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +2 -3
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +69 -43
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +2 -14
  110. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -91
  111. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -181
  112. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +17 -0
  113. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
  114. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +6 -152
  115. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +162 -0
  116. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +360 -0
  117. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +2 -118
  118. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +1 -1
  119. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +12 -1
  120. package/cpp/llama.cpp/ggml/src/ggml.c +107 -36
  121. package/cpp/llama.cpp/ggml/src/gguf.cpp +33 -33
  122. package/cpp/llama.cpp/gguf-py/gguf/constants.py +100 -15
  123. package/cpp/llama.cpp/gguf-py/gguf/gguf_reader.py +1 -1
  124. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +44 -12
  125. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_editor_gui.py +21 -10
  126. package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_new_metadata.py +5 -2
  127. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +128 -31
  128. package/cpp/llama.cpp/gguf-py/gguf/utility.py +1 -1
  129. package/cpp/llama.cpp/gguf-py/pyproject.toml +1 -1
  130. package/cpp/llama.cpp/include/llama.h +62 -125
  131. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +1 -1
  132. package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +1 -1
  133. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +1 -1
  134. package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +1 -1
  135. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +1 -1
  136. package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +1 -1
  137. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +1 -1
  138. package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +1 -1
  139. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +1 -1
  140. package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +1 -1
  141. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +1 -1
  142. package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +1 -1
  143. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +1 -1
  144. package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +1 -1
  145. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +1 -1
  146. package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +1 -1
  147. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +1 -1
  148. package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +1 -1
  149. package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
  150. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +1 -1
  151. package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +1 -1
  152. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +1 -1
  153. package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +1 -1
  154. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +1 -1
  155. package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +1 -1
  156. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +1 -1
  157. package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +1 -1
  158. package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +62 -0
  159. package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +85 -0
  160. package/cpp/llama.cpp/models/templates/README.md +2 -0
  161. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  162. package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  163. package/cpp/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  164. package/cpp/llama.cpp/requirements/requirements-gguf_editor_gui.txt +1 -1
  165. package/cpp/llama.cpp/src/CMakeLists.txt +2 -0
  166. package/cpp/llama.cpp/src/llama-arch.cpp +6 -0
  167. package/cpp/llama.cpp/src/llama-arch.h +2 -0
  168. package/cpp/llama.cpp/src/llama-batch.cpp +3 -1
  169. package/cpp/llama.cpp/src/llama-context.cpp +340 -123
  170. package/cpp/llama.cpp/src/llama-context.h +30 -0
  171. package/cpp/llama.cpp/src/llama-cparams.cpp +4 -0
  172. package/cpp/llama.cpp/src/llama-cparams.h +2 -0
  173. package/cpp/llama.cpp/src/llama-grammar.cpp +12 -2
  174. package/cpp/llama.cpp/src/llama-graph.cpp +157 -247
  175. package/cpp/llama.cpp/src/llama-graph.h +52 -7
  176. package/cpp/llama.cpp/src/llama-hparams.cpp +17 -1
  177. package/cpp/llama.cpp/src/llama-hparams.h +37 -5
  178. package/cpp/llama.cpp/src/llama-kv-cache.cpp +742 -481
  179. package/cpp/llama.cpp/src/llama-kv-cache.h +196 -99
  180. package/cpp/llama.cpp/src/llama-kv-cells.h +379 -0
  181. package/cpp/llama.cpp/src/llama-memory.h +4 -3
  182. package/cpp/llama.cpp/src/llama-model-loader.cpp +22 -17
  183. package/cpp/llama.cpp/src/llama-model-saver.cpp +281 -0
  184. package/cpp/llama.cpp/src/llama-model-saver.h +37 -0
  185. package/cpp/llama.cpp/src/llama-model.cpp +529 -172
  186. package/cpp/llama.cpp/src/llama-model.h +6 -1
  187. package/cpp/llama.cpp/src/llama-quant.cpp +15 -13
  188. package/cpp/llama.cpp/src/llama-sampling.cpp +2 -2
  189. package/cpp/llama.cpp/src/llama-vocab.cpp +35 -8
  190. package/cpp/llama.cpp/src/llama-vocab.h +6 -0
  191. package/cpp/llama.cpp/src/llama.cpp +14 -0
  192. package/cpp/rn-completion.cpp +4 -2
  193. package/ios/include/chat.h +73 -6
  194. package/ios/include/common/minja/chat-template.hpp +9 -5
  195. package/ios/include/common/minja/minja.hpp +69 -36
  196. package/ios/include/common.h +21 -17
  197. package/ios/include/llama.h +62 -125
  198. package/ios/libs/llama.xcframework/Info.plist +19 -19
  199. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  200. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4617 -4487
  201. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  202. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +26 -7
  203. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +62 -125
  204. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  205. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  206. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  207. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3557 -3435
  208. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  209. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  210. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  211. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4638 -4508
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3559 -3437
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +237 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +26 -7
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +62 -125
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +237 -0
  219. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +26 -7
  220. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +62 -125
  221. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  222. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +237 -0
  223. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +26 -7
  224. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +62 -125
  225. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  226. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  227. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  228. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4616 -4487
  229. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +26 -7
  231. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +62 -125
  232. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  233. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  234. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4637 -4508
  235. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3556 -3435
  236. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  237. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  238. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  239. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  240. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  241. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4653 -4523
  242. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +237 -0
  243. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +26 -7
  244. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +62 -125
  245. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  246. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  247. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4674 -4544
  248. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3587 -3465
  249. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +237 -0
  250. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +26 -7
  251. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +62 -125
  252. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  253. package/package.json +1 -1
  254. package/cpp/llama.cpp/common/stb_image.h +0 -7988
  255. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
  256. package/cpp/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
  257. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
  258. package/cpp/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
  259. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
  260. package/cpp/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
  261. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
  262. package/cpp/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
  263. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
  264. package/cpp/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
  265. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
  266. package/cpp/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
@@ -76,6 +76,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B,
76
76
  }
77
77
 
78
78
  void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
79
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/5);
79
80
  const float * k_d = static_cast<const float *>(dst->src[0]->data);
80
81
  const float * v_d = static_cast<const float *>(dst->src[1]->data);
81
82
  const float * r_d = static_cast<const float *>(dst->src[2]->data);
@@ -24,6 +24,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
24
24
  const int blocks_per_row = ncols / block_traits::qk;
25
25
  constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
26
26
  constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
27
+ const int nblocks = nrows * (ncols / block_traits::qk);
27
28
 
28
29
  static_assert(blocks_per_subgroup > 0);
29
30
  static_assert(block_elements_per_subgroup > 0);
@@ -45,7 +46,7 @@ static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __r
45
46
  // x block quant index when casting the quants to int
46
47
  const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
47
48
 
48
- partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
49
+ partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs, nblocks);
49
50
  }
50
51
  }
51
52
 
@@ -739,6 +740,27 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
739
740
  }
740
741
  }
741
742
 
743
+ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
744
+ const int nrows, dpct::queue_ptr stream) {
745
+ GGML_ASSERT(ncols % QK_K == 0);
746
+
747
+ const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y);
748
+ constexpr size_t num_subgroups = 16;
749
+ GGML_ASSERT(block_num_y % num_subgroups == 0);
750
+
751
+ const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE);
752
+ const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
753
+
754
+ stream->submit([&](sycl::handler & cgh) {
755
+ cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
756
+ [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
757
+ mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K>>(vx, vy, dst, ncols,
758
+ nrows, nd_item);
759
+ });
760
+ });
761
+ }
762
+
763
+
742
764
  static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
743
765
  float *dst, const int ncols,
744
766
  const int nrows,
@@ -1035,7 +1057,14 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
1035
1057
  mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1036
1058
  break;
1037
1059
  case GGML_TYPE_Q4_K:
1038
- mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1060
+ if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1061
+ ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1062
+ GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_k_q8_1_sycl\n");
1063
+ reorder_mul_mat_vec_q4_k_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1064
+ } else {
1065
+ GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_K_q8_1_sycl\n");
1066
+ mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
1067
+ }
1039
1068
  break;
1040
1069
  case GGML_TYPE_Q5_K:
1041
1070
  mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
@@ -1,40 +1,50 @@
1
1
  #include "norm.hpp"
2
+ #include "ggml-sycl/common.hpp"
3
+ #include "ggml-sycl/presets.hpp"
2
4
 
3
- static void norm_f32(const float* x, float* dst, const int ncols, const float eps,
4
- const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
5
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
6
- item_ct1.get_local_id(1);
7
- const int tid = item_ct1.get_local_id(2);
5
+ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
6
+ const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) {
7
+
8
+ const int nrows = item_ct1.get_group_range(2);
9
+ const int nchannels = item_ct1.get_group_range(1);
8
10
 
9
11
  const int nthreads = item_ct1.get_local_range(2);
12
+ const int sample = item_ct1.get_group(0);
13
+ const int channel = item_ct1.get_group(1);
14
+ const int row = item_ct1.get_group(2);
15
+
16
+ const int tid = item_ct1.get_local_id(2);
10
17
  const int nwarps = nthreads / WARP_SIZE;
18
+
19
+ const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
20
+ const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
21
+
22
+ x += strided_offset;
23
+ dst += packed_offset;
24
+
11
25
  sycl::float2 mean_var = sycl::float2(0.f, 0.f);
12
26
 
13
27
  for (int col = tid; col < ncols; col += block_size) {
14
- const float xi = x[row * ncols + col];
28
+ const float xi = x[col];
15
29
  mean_var.x() += xi;
16
30
  mean_var.y() += xi * xi;
17
31
  }
18
32
 
19
33
  // sum up partial sums
20
34
  mean_var = warp_reduce_sum(mean_var, item_ct1);
21
- if (block_size > WARP_SIZE) {
22
-
23
- int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
24
- int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
25
- if (lane_id == 0) {
26
- s_sum[warp_id] = mean_var;
35
+ if (block_size > WARP_SIZE) {
36
+ const auto sub_group = item_ct1.get_sub_group();
37
+ const auto sg_id = sub_group.get_group_linear_id();
38
+ const auto wi_in_sg = sub_group.get_local_linear_id();
39
+ if (wi_in_sg == 0) {
40
+ s_sum[sg_id] = mean_var;
27
41
  }
28
- /*
29
- DPCT1118:0: SYCL group functions and algorithms must be encountered in
30
- converged control flow. You may need to adjust the code.
31
- */
32
42
  item_ct1.barrier(sycl::access::fence_space::local_space);
33
43
  mean_var = 0.f;
34
- size_t nreduce = nwarps / WARP_SIZE;
44
+ const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
35
45
  for (size_t i = 0; i < nreduce; i += 1)
36
46
  {
37
- mean_var += s_sum[lane_id + i * WARP_SIZE];
47
+ mean_var += s_sum[wi_in_sg + i * WARP_SIZE];
38
48
  }
39
49
  mean_var = warp_reduce_sum(mean_var, item_ct1);
40
50
  }
@@ -44,7 +54,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const float ep
44
54
  const float inv_std = sycl::rsqrt(var + eps);
45
55
 
46
56
  for (int col = tid; col < ncols; col += block_size) {
47
- dst[row * ncols + col] = (x[row * ncols + col] - mean) * inv_std;
57
+ dst[col] = (x[col] - mean) * inv_std;
48
58
  }
49
59
  }
50
60
 
@@ -135,39 +145,51 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con
135
145
  }
136
146
  }
137
147
 
138
- static void rms_norm_f32(const float* x, float* dst, const int ncols, const float eps,
139
- const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
140
- const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
141
- item_ct1.get_local_id(1);
142
- const int tid = item_ct1.get_local_id(2);
148
+ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
149
+ const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) {
150
+
151
+ const int nrows = item_ct1.get_group_range(2);
152
+ const int nchannels = item_ct1.get_group_range(1);
153
+
154
+ const int sample = item_ct1.get_group(0);
155
+ const int channel = item_ct1.get_group(1);
156
+ const int row = item_ct1.get_group(2);
157
+
143
158
  const int nthreads = item_ct1.get_local_range(2);
159
+
160
+ const int tid = item_ct1.get_local_id(2);
144
161
  const int nwarps = nthreads / WARP_SIZE;
162
+
163
+ const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row});
164
+ const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row});
165
+
166
+ x += strided_offset;
167
+ dst += packed_offset;
168
+
169
+
145
170
  float tmp = 0.0f; // partial sum for thread in warp
146
171
 
147
172
  for (int col = tid; col < ncols; col += block_size) {
148
- const float xi = x[row * ncols + col];
173
+ const float xi = x[col];
149
174
  tmp += xi * xi;
150
175
  }
151
176
 
152
177
  // sum up partial sums
153
178
  tmp = warp_reduce_sum(tmp, item_ct1);
154
179
  if (block_size > WARP_SIZE) {
155
-
156
- int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
157
- int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
158
- if (lane_id == 0) {
159
- s_sum[warp_id] = tmp;
180
+ const auto sub_group = item_ct1.get_sub_group();
181
+ const auto sg_id = sub_group.get_group_linear_id();
182
+ const auto wi_in_sg = sub_group.get_local_linear_id();
183
+ if (wi_in_sg == 0) {
184
+ s_sum[sg_id] = tmp;
160
185
  }
161
- /*
162
- DPCT1118:3: SYCL group functions and algorithms must be encountered in
163
- converged control flow. You may need to adjust the code.
164
- */
186
+
165
187
  item_ct1.barrier(sycl::access::fence_space::local_space);
166
- size_t nreduce = nwarps / WARP_SIZE;
188
+ const size_t nreduce = ceil_div(nwarps, WARP_SIZE);
167
189
  tmp = 0.f;
168
190
  for (size_t i = 0; i < nreduce; i += 1)
169
191
  {
170
- tmp += s_sum[lane_id + i * WARP_SIZE];
192
+ tmp += s_sum[wi_in_sg + i * WARP_SIZE];
171
193
  }
172
194
  tmp = warp_reduce_sum(tmp, item_ct1);
173
195
  }
@@ -176,7 +198,7 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const floa
176
198
  const float scale = sycl::rsqrt(mean + eps);
177
199
 
178
200
  for (int col = tid; col < ncols; col += block_size) {
179
- dst[row * ncols + col] = scale * x[row * ncols + col];
201
+ dst[col] = scale * x[col];
180
202
  }
181
203
  }
182
204
 
@@ -224,20 +246,20 @@ static void l2_norm_f32(const float* x, float* dst, const int ncols, const float
224
246
  }
225
247
  }
226
248
 
227
- static void norm_f32_sycl(const float* x, float* dst, const int ncols,
228
- const int nrows, const float eps,
229
- queue_ptr stream, int device) {
249
+ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
250
+ const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
251
+ const float eps, queue_ptr stream, int device) {
252
+
253
+ const sycl::range<3> global_dims(nsamples, nchannels, nrows);
230
254
  GGML_ASSERT(ncols % WARP_SIZE == 0);
231
255
  if (ncols < 1024) {
232
256
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
233
257
  stream->submit([&](sycl::handler& cgh) {
234
258
  cgh.parallel_for(
235
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
236
- block_dims),
259
+ sycl::nd_range<3>(global_dims * block_dims, block_dims),
237
260
  [=](sycl::nd_item<3> item_ct1)
238
261
  [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
239
- norm_f32(x, dst, ncols, eps, item_ct1,
240
- nullptr, WARP_SIZE);
262
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
241
263
  });
242
264
  });
243
265
  }
@@ -252,15 +274,12 @@ static void norm_f32_sycl(const float* x, float* dst, const int ncols,
252
274
  */
253
275
  stream->submit([&](sycl::handler& cgh) {
254
276
  sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
255
- sycl::range<1>(work_group_size / WARP_SIZE), cgh);
256
-
277
+ sycl::range<1>(work_group_size / WARP_SIZE), cgh);
257
278
  cgh.parallel_for(
258
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
259
- block_dims),
279
+ sycl::nd_range<3>(global_dims * block_dims, block_dims),
260
280
  [=](sycl::nd_item<3> item_ct1)
261
281
  [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
262
- norm_f32(x, dst, ncols, eps, item_ct1,
263
- get_pointer(s_sum_acc_ct1), work_group_size);
282
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
264
283
  });
265
284
  });
266
285
  }
@@ -313,21 +332,20 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313
332
  }
314
333
  }
315
334
 
316
- static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
317
- const int nrows, const float eps,
318
- queue_ptr stream, int device) {
335
+ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples,
336
+ const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) {
319
337
  GGML_ASSERT(ncols % WARP_SIZE == 0);
320
338
  // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
339
+
340
+ const sycl::range<3> global_dims(nsamples, nchannels, nrows);
321
341
  if (ncols < 1024) {
322
342
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
323
343
  stream->submit([&](sycl::handler& cgh) {
324
344
  cgh.parallel_for(
325
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
326
- block_dims),
345
+ sycl::nd_range<3>(global_dims * block_dims, block_dims),
327
346
  [=](sycl::nd_item<3> item_ct1)
328
347
  [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
329
- rms_norm_f32(x, dst, ncols, eps, item_ct1,
330
- nullptr, WARP_SIZE);
348
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
331
349
  });
332
350
  });
333
351
  }
@@ -344,12 +362,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols,
344
362
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
345
363
  cgh);
346
364
  cgh.parallel_for(
347
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
348
- block_dims),
365
+ sycl::nd_range<3>(global_dims * block_dims, block_dims),
349
366
  [=](sycl::nd_item<3> item_ct1)
350
367
  [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
351
- rms_norm_f32(x, dst, ncols, eps, item_ct1,
352
- get_pointer(s_sum_acc_ct1), work_group_size);
368
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
353
369
  });
354
370
  });
355
371
  }
@@ -398,12 +414,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398
414
  }
399
415
 
400
416
  void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
417
+ const ggml_tensor * src0 = dst->src[0];
401
418
 
402
419
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
403
420
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
404
421
 
405
- const int64_t ne00 = dst->src[0]->ne[0];
406
- const int64_t nrows = ggml_nrows(dst->src[0]);
422
+ GGML_TENSOR_UNARY_OP_LOCALS
407
423
  dpct::queue_ptr main_stream = ctx.stream();
408
424
  SYCL_CHECK(ggml_sycl_set_device(ctx.device));
409
425
  const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
@@ -411,8 +427,14 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
411
427
 
412
428
  float eps;
413
429
  memcpy(&eps, dst->op_params, sizeof(float));
414
-
415
- norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
430
+ GGML_ASSERT(eps >= 0.0f);
431
+ const size_t ts0 = ggml_type_size(src0->type);
432
+ GGML_ASSERT(nb00 == ts0);
433
+ const int64_t s01 = nb01 / ts0;
434
+ const int64_t s02 = nb02 / ts0;
435
+ const int64_t s03 = nb03 / ts0;
436
+
437
+ norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
416
438
  }
417
439
 
418
440
  void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -436,11 +458,10 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
436
458
 
437
459
  void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
438
460
 
461
+ const ggml_tensor * src0 = dst->src[0];
439
462
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
440
463
  GGML_ASSERT(dst->type == GGML_TYPE_F32);
441
464
 
442
- const int64_t ne00 = dst->src[0]->ne[0];
443
- const int64_t nrows = ggml_nrows(dst->src[0]);
444
465
  dpct::queue_ptr main_stream = ctx.stream();
445
466
  SYCL_CHECK(ggml_sycl_set_device(ctx.device));
446
467
 
@@ -450,7 +471,13 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
450
471
  float eps;
451
472
  memcpy(&eps, dst->op_params, sizeof(float));
452
473
 
453
- rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device);
474
+ GGML_TENSOR_UNARY_OP_LOCALS
475
+ const size_t ts0 = ggml_type_size(src0->type);
476
+ GGML_ASSERT(nb00 == ts0);
477
+ const int64_t s01 = nb01 / ts0;
478
+ const int64_t s02 = nb02 / ts0;
479
+ const int64_t s03 = nb03 / ts0;
480
+ rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
454
481
  }
455
482
 
456
483
  void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
@@ -1,6 +1,7 @@
1
1
  #include "outprod.hpp"
2
2
 
3
3
  void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
4
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
4
5
  const ggml_tensor *src0 = dst->src[0];
5
6
  const ggml_tensor *src1 = dst->src[1];
6
7
 
@@ -56,6 +56,28 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
56
56
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
57
57
  };
58
58
 
59
+ template <> struct block_q_t<GGML_TYPE_Q4_K> {
60
+ struct traits {
61
+ static constexpr uint32_t qk = QK_K;
62
+ static constexpr uint32_t qi = QI4_K;
63
+ static constexpr uint32_t qr = QR4_K;
64
+ static constexpr uint32_t vdr_mmvq = 2;
65
+ };
66
+
67
+ static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
68
+
69
+ static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
70
+ auto nblocks = (nrows * (ncols / traits::qk));
71
+ return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
72
+ }
73
+
74
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
75
+
76
+ constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
77
+
78
+ constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
79
+ };
80
+
59
81
  } // namespace ggml_sycl_reordered
60
82
 
61
83
  #endif // GGML_SYCL_QUANTS_HPP
@@ -355,8 +355,7 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
355
355
  }
356
356
 
357
357
  void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
358
- GGML_SYCL_DEBUG("call %s\n", __func__);
358
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
359
359
  ggml_sycl_op_rope(ctx, dst);
360
- GGML_SYCL_DEBUG("call %s done\n", __func__);
361
360
  }
362
361
 
@@ -225,7 +225,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
225
225
  }
226
226
 
227
227
  void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
228
-
228
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
229
229
  GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
230
230
  GGML_ASSERT( dst->type == GGML_TYPE_F32);
231
231
 
@@ -249,16 +249,13 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
249
249
 
250
250
  if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
251
251
  const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
252
- GGML_SYCL_DEBUG("%s: F16 mask\n", __func__);
253
252
  soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
254
253
  main_stream, ctx.device);
255
254
  } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
256
255
  const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
257
- GGML_SYCL_DEBUG("%s: F32 mask\n", __func__);
258
256
  soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
259
257
  } else {
260
258
  /* mask unavailable */
261
- GGML_SYCL_DEBUG("%s: No mask\n", __func__);
262
259
  soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
263
260
  }
264
261
  }
@@ -56,8 +56,8 @@ static void timestep_embedding_f32_sycl(
56
56
  }
57
57
 
58
58
  void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
59
- const ggml_tensor *src0 = dst->src[0];
60
- const ggml_tensor *src1 = dst->src[1];
59
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
60
+ const ggml_tensor * src0 = dst->src[0];
61
61
  const float * src0_d = (const float *)src0->data;
62
62
  float * dst_d = (float *)dst->data;
63
63
  dpct::queue_ptr stream = ctx.stream();
@@ -69,5 +69,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
69
69
  const int max_period = dst->op_params[1];
70
70
 
71
71
  timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
72
- GGML_UNUSED(src1);
73
72
  }
@@ -285,7 +285,7 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
285
285
  }
286
286
 
287
287
  __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
288
- const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
288
+ const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
289
289
  const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
290
290
  const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
291
291
  int v[q4_0_traits::vdr_mmvq];
@@ -303,6 +303,67 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
303
303
  };
304
304
  };
305
305
 
306
+ static inline float vec_dot_q4_K_q8_1_common(const int * __restrict__ q4, const uint16_t * __restrict__ scales,
307
+ const ggml_half2 & dm, const block_q8_1 * __restrict__ bq8_1,
308
+ const int & iqs) {
309
+ int v[2];
310
+ int u[2 * QR4_K];
311
+ float d8[QR4_K];
312
+
313
+ v[0] = q4[0];
314
+ v[1] = q4[4];
315
+
316
+ uint16_t aux[2];
317
+ const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
318
+ if (j < 2) {
319
+ aux[0] = scales[j + 0] & 0x3f3f;
320
+ aux[1] = scales[j + 2] & 0x3f3f;
321
+ } else {
322
+ aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
323
+ aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
324
+ }
325
+
326
+ const uint8_t * sc = (const uint8_t *) aux;
327
+ const uint8_t * m = sc + 2;
328
+
329
+ const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
330
+
331
+ for (int i = 0; i < QR4_K; ++i) {
332
+ const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
333
+ d8[i] = bq8i->ds[0];
334
+
335
+ const int * q8 = (const int *) bq8i->qs + ((iqs / 2) % 4);
336
+ u[2 * i + 0] = q8[0];
337
+ u[2 * i + 1] = q8[4];
338
+ }
339
+
340
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, dm, d8);
341
+ }
342
+
343
+ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
344
+ static constexpr ggml_type gtype = GGML_TYPE_Q4_K;
345
+
346
+ using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
347
+ using q4_k_traits = typename q4_k_block::traits;
348
+
349
+ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
350
+ const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
351
+ const int ib = ibx_offset / (QK_K / 2);
352
+
353
+ const uint8_t * base = static_cast<const uint8_t *>(vbq);
354
+ const uint8_t * qs = base + ibx_offset;
355
+ const int total_qs_bytes = nblocks * (QK_K / 2);
356
+ const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
357
+ const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
358
+
359
+ const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
360
+ const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
361
+ const uint16_t * scales = (const uint16_t *) scs;
362
+
363
+ return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
364
+ }
365
+ };
366
+
306
367
  #define VDR_Q4_0_Q8_1_MMVQ 2
307
368
  #define VDR_Q4_0_Q8_1_MMQ 4
308
369
 
@@ -649,52 +710,17 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
649
710
  return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
650
711
  }
651
712
 
652
- static __dpct_inline__ float
653
- vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
654
- const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
655
-
713
+ static __dpct_inline__ float vec_dot_q4_K_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
714
+ const int & iqs) {
656
715
  #ifndef GGML_QKK_64
657
- const block_q4_K * bq4_K = (const block_q4_K *) vbq;
658
-
659
- int v[2];
660
- int u[2*QR4_K];
661
- float d8[QR4_K];
662
716
 
663
- // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
664
- const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
665
-
666
- // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
667
- // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
668
- // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
669
- // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
670
-
671
- const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
672
- v[0] = q4[0];
673
- v[1] = q4[4];
674
-
675
- const uint16_t * scales = (const uint16_t *)bq4_K->scales;
676
- uint16_t aux[2];
677
- const int j = bq8_offset/2;
678
- if (j < 2) {
679
- aux[0] = scales[j+0] & 0x3f3f;
680
- aux[1] = scales[j+2] & 0x3f3f;
681
- } else {
682
- aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
683
- aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
684
- }
685
- const uint8_t * sc = (const uint8_t *)aux;
686
- const uint8_t * m = sc + 2;
687
-
688
- for (int i = 0; i < QR4_K; ++i) {
689
- const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
690
- d8[i] = bq8i->ds[0];
717
+ const block_q4_K * bq4_K = (const block_q4_K *) vbq;
691
718
 
692
- const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
693
- u[2*i+0] = q8[0];
694
- u[2*i+1] = q8[4];
695
- }
719
+ const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
720
+ const int * q4 = (const int *) (bq4_K->qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
721
+ const uint16_t * scales = (const uint16_t *) bq4_K->scales;
696
722
 
697
- return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
723
+ return vec_dot_q4_K_q8_1_common(q4, scales, bq4_K->dm, bq8_1, iqs);
698
724
 
699
725
  #else
700
726
 
@@ -180,10 +180,7 @@ static void rwkv_wkv7_f32_kernel(
180
180
  }
181
181
 
182
182
  void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
183
-
184
- const ggml_tensor *src0 = dst->src[0];
185
- const ggml_tensor *src1 = dst->src[1];
186
-
183
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/6);
187
184
  const float* k_d = (const float*)dst->src[0]->data;
188
185
  const float* v_d = (const float*)dst->src[1]->data;
189
186
  const float* r_d = (const float*)dst->src[2]->data;
@@ -236,16 +233,10 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
236
233
  });
237
234
  });
238
235
  }
239
-
240
- GGML_UNUSED(src0);
241
- GGML_UNUSED(src1);
242
236
  }
243
237
 
244
238
  void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
245
-
246
- const ggml_tensor *src0 = dst->src[0];
247
- const ggml_tensor *src1 = dst->src[1];
248
-
239
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/7);
249
240
  const float* r_d = (const float*)dst->src[0]->data;
250
241
  const float* w_d = (const float*)dst->src[1]->data;
251
242
  const float* k_d = (const float*)dst->src[2]->data;
@@ -299,7 +290,4 @@ void ggml_sycl_op_rwkv_wkv7(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
299
290
  });
300
291
  });
301
292
  }
302
-
303
- GGML_UNUSED(src0);
304
- GGML_UNUSED(src1);
305
293
  }