@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -254,14 +254,13 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
254
254
  GGML_ASSERT(ncols % WARP_SIZE == 0);
255
255
  if (ncols < 1024) {
256
256
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
257
- stream->submit([&](sycl::handler& cgh) {
258
- cgh.parallel_for(
259
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
260
- [=](sycl::nd_item<3> item_ct1)
261
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
262
- norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
263
- });
264
- });
257
+ sycl_launch(stream, [&](sycl::handler & cgh) {
258
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
259
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
260
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
261
+ nullptr, WARP_SIZE);
262
+ });
263
+ });
265
264
  }
266
265
  else {
267
266
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -272,16 +271,15 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i
272
271
  the limit. To get the device limit, query
273
272
  info::device::max_work_group_size. Adjust the work-group size if needed.
274
273
  */
275
- stream->submit([&](sycl::handler& cgh) {
274
+ sycl_launch(stream, [&](sycl::handler & cgh) {
276
275
  sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
277
276
  sycl::range<1>(work_group_size / WARP_SIZE), cgh);
278
- cgh.parallel_for(
279
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
280
- [=](sycl::nd_item<3> item_ct1)
281
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
282
- norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
283
- });
284
- });
277
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
278
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
279
+ norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
280
+ get_pointer(s_sum_acc_ct1), work_group_size);
281
+ });
282
+ });
285
283
  }
286
284
  }
287
285
 
@@ -290,18 +288,14 @@ static void group_norm_f32_sycl(const float* x, float* dst,
290
288
  const int ne_elements, queue_ptr stream, int device) {
291
289
  if (group_size < 1024) {
292
290
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
293
- stream->submit([&](sycl::handler& cgh) {
291
+ sycl_launch(stream, [&](sycl::handler & cgh) {
294
292
  const float eps_ct4 = eps;
295
- cgh.parallel_for(
296
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
297
- block_dims),
298
- [=](sycl::nd_item<3> item_ct1)
299
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
300
- group_norm_f32(
301
- x, dst, group_size, ne_elements, eps_ct4, item_ct1,
302
- nullptr, WARP_SIZE);
303
- });
304
- });
293
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
294
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
295
+ group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1, nullptr,
296
+ WARP_SIZE);
297
+ });
298
+ });
305
299
  }
306
300
  else {
307
301
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -313,22 +307,18 @@ static void group_norm_f32_sycl(const float* x, float* dst,
313
307
  info::device::max_work_group_size. Adjust the work-group size if needed.
314
308
  */
315
309
 
316
- stream->submit([&](sycl::handler& cgh) {
310
+ sycl_launch(stream, [&](sycl::handler & cgh) {
317
311
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
318
312
  cgh);
319
313
 
320
314
  const float eps_ct4 = eps;
321
315
 
322
- cgh.parallel_for(
323
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
324
- block_dims),
325
- [=](sycl::nd_item<3> item_ct1)
326
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
327
- group_norm_f32(x, dst, group_size, ne_elements,
328
- eps_ct4, item_ct1,
329
- get_pointer(s_sum_acc_ct1), work_group_size);
330
- });
331
- });
316
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims),
317
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
318
+ group_norm_f32(x, dst, group_size, ne_elements, eps_ct4, item_ct1,
319
+ get_pointer(s_sum_acc_ct1), work_group_size);
320
+ });
321
+ });
332
322
  }
333
323
  }
334
324
 
@@ -340,14 +330,13 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
340
330
  const sycl::range<3> global_dims(nsamples, nchannels, nrows);
341
331
  if (ncols < 1024) {
342
332
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
343
- stream->submit([&](sycl::handler& cgh) {
344
- cgh.parallel_for(
345
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
346
- [=](sycl::nd_item<3> item_ct1)
347
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
348
- rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE);
349
- });
350
- });
333
+ sycl_launch(stream, [&](sycl::handler & cgh) {
334
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
335
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
336
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
337
+ nullptr, WARP_SIZE);
338
+ });
339
+ });
351
340
  }
352
341
  else {
353
342
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -358,16 +347,15 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const
358
347
  the limit. To get the device limit, query
359
348
  info::device::max_work_group_size. Adjust the work-group size if needed.
360
349
  */
361
- stream->submit([&](sycl::handler& cgh) {
350
+ sycl_launch(stream, [&](sycl::handler & cgh) {
362
351
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
363
352
  cgh);
364
- cgh.parallel_for(
365
- sycl::nd_range<3>(global_dims * block_dims, block_dims),
366
- [=](sycl::nd_item<3> item_ct1)
367
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
368
- rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size);
369
- });
370
- });
353
+ sycl_parallel_for(cgh, sycl::nd_range<3>(global_dims * block_dims, block_dims),
354
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
355
+ rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1,
356
+ get_pointer(s_sum_acc_ct1), work_group_size);
357
+ });
358
+ });
371
359
  }
372
360
  }
373
361
 
@@ -378,16 +366,12 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
378
366
  // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
379
367
  if (ncols < 1024) {
380
368
  const sycl::range<3> block_dims(1, 1, WARP_SIZE);
381
- stream->submit([&](sycl::handler& cgh) {
382
- cgh.parallel_for(
383
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
384
- block_dims),
385
- [=](sycl::nd_item<3> item_ct1)
386
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
387
- l2_norm_f32(x, dst, ncols, eps, item_ct1,
388
- nullptr, WARP_SIZE);
389
- });
390
- });
369
+ sycl_launch(stream, [&](sycl::handler & cgh) {
370
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
371
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
372
+ l2_norm_f32(x, dst, ncols, eps, item_ct1, nullptr, WARP_SIZE);
373
+ });
374
+ });
391
375
  }
392
376
  else {
393
377
  const int work_group_size = ggml_sycl_info().max_work_group_sizes[device];
@@ -398,18 +382,15 @@ static void l2_norm_f32_sycl(const float* x, float* dst, const int ncols,
398
382
  the limit. To get the device limit, query
399
383
  info::device::max_work_group_size. Adjust the work-group size if needed.
400
384
  */
401
- stream->submit([&](sycl::handler& cgh) {
385
+ sycl_launch(stream, [&](sycl::handler & cgh) {
402
386
  sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(work_group_size / WARP_SIZE),
403
387
  cgh);
404
- cgh.parallel_for(
405
- sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
406
- block_dims),
407
- [=](sycl::nd_item<3> item_ct1)
408
- [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
409
- l2_norm_f32(x, dst, ncols, eps, item_ct1,
410
- get_pointer(s_sum_acc_ct1), work_group_size);
411
- });
412
- });
388
+ sycl_parallel_for(cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
389
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
390
+ l2_norm_f32(x, dst, ncols, eps, item_ct1, get_pointer(s_sum_acc_ct1),
391
+ work_group_size);
392
+ });
393
+ });
413
394
  }
414
395
  }
415
396
 
@@ -14,12 +14,13 @@
14
14
  #ifndef GGML_SYCL_QUANTS_HPP
15
15
  #define GGML_SYCL_QUANTS_HPP
16
16
 
17
+ #include <utility>
18
+
17
19
  #include "ggml-common.h"
18
20
  #include "ggml.h"
19
21
 
20
22
  namespace ggml_sycl_reordered {
21
23
 
22
-
23
24
  // The reordered block moves quants (qs) and scales(d) to two
24
25
  // uniform regions of memory that is contiguous in the same tensor.
25
26
  // What this means is that instead of having:
@@ -32,7 +33,6 @@ namespace ggml_sycl_reordered {
32
33
 
33
34
  template <ggml_type type> struct block_q_t;
34
35
 
35
-
36
36
  // qk number of weights / quants in a block
37
37
  // qr number of weights in a byte (described as 'before dequantization')
38
38
  // for quantization types that has low and high bits split, qr is calculated with
@@ -47,10 +47,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
47
47
  static constexpr uint32_t vdr_mmvq = 2;
48
48
  };
49
49
 
50
- static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
50
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
51
+ return { block_index * (traits::qk / traits::qr), 0 };
52
+ }
51
53
 
52
- static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
53
- return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
54
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
55
+ return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 };
54
56
  }
55
57
 
56
58
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -64,20 +66,46 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
64
66
  static constexpr uint32_t vdr_mmvq = 2;
65
67
  };
66
68
 
67
- static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
69
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
70
+ return { block_index * (traits::qk / traits::qr), 0 };
71
+ }
68
72
 
69
- static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
73
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
70
74
  auto nblocks = (nrows * (ncols / traits::qk));
71
- return (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2));
75
+ return { nblocks * (QK_K / 2),
76
+ (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
72
77
  }
73
78
 
74
79
  static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
75
80
 
76
81
  constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
77
-
78
- constexpr size_t get_dm_offset(int nblocks) { return get_total_qs_bytes(nblocks) + nblocks * K_SCALE_SIZE; }
79
82
  };
80
83
 
84
+ template <> struct block_q_t<GGML_TYPE_Q6_K> {
85
+ struct traits {
86
+ static constexpr uint32_t qk = QK_K;
87
+ static constexpr uint32_t qi = QI6_K;
88
+ static constexpr uint32_t qr = QR6_K;
89
+ static constexpr uint32_t vdr_mmvq = 1;
90
+ };
91
+
92
+ static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
93
+ auto low_bits_index = block_index * (traits::qk / traits::qr);
94
+ // the index of high bits it's after all low bits
95
+ auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
96
+ return { low_bits_index, high_bits_index };
97
+ }
98
+
99
+ static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
100
+ auto nblocks = (nrows * (ncols / traits::qk));
101
+ auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
102
+ auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
103
+ auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16);
104
+ return { block_scales, sb_scale };
105
+ }
106
+
107
+ static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
108
+ };
81
109
  } // namespace ggml_sycl_reordered
82
110
 
83
111
  #endif // GGML_SYCL_QUANTS_HPP
@@ -235,20 +235,22 @@ static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, c
235
235
  the limit. To get the device limit, query
236
236
  info::device::max_work_group_size. Adjust the work-group size if needed.
237
237
  */
238
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
239
- rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
240
- theta_scale, freq_factors, item_ct1);
241
- });
238
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
239
+ [=](sycl::nd_item<3> item_ct1) {
240
+ rope_norm<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
241
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
242
+ });
242
243
  } else {
243
244
  /*
244
245
  DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
245
246
  the limit. To get the device limit, query
246
247
  info::device::max_work_group_size. Adjust the work-group size if needed.
247
248
  */
248
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
249
- rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
250
- theta_scale, freq_factors, item_ct1);
251
- });
249
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
250
+ [=](sycl::nd_item<3> item_ct1) {
251
+ rope_norm<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
252
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
253
+ });
252
254
  }
253
255
  }
254
256
 
@@ -267,15 +269,17 @@ static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, c
267
269
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
268
270
 
269
271
  if (freq_factors == nullptr) {
270
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
271
- rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
272
- theta_scale, freq_factors, item_ct1);
273
- });
272
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
273
+ [=](sycl::nd_item<3> item_ct1) {
274
+ rope_neox<T, false>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
275
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
276
+ });
274
277
  } else {
275
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
276
- rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims,
277
- theta_scale, freq_factors, item_ct1);
278
- });
278
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims),
279
+ [=](sycl::nd_item<3> item_ct1) {
280
+ rope_neox<T, true>(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor,
281
+ attn_factor, corr_dims, theta_scale, freq_factors, item_ct1);
282
+ });
279
283
  }
280
284
  }
281
285
 
@@ -298,12 +302,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
298
302
  }
299
303
  // launch kernel
300
304
  if (freq_factors == nullptr) {
301
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
305
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
302
306
  rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
303
307
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
304
308
  });
305
309
  } else {
306
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
310
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
307
311
  rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
308
312
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
309
313
  });
@@ -333,12 +337,12 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1,
333
337
  }
334
338
  // launch kernel
335
339
  if (freq_factors == nullptr) {
336
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
340
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
337
341
  rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
338
342
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
339
343
  });
340
344
  } else {
341
- stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
345
+ sycl_parallel_for(stream, nd_range, [=](sycl::nd_item<3> item_ct1) {
342
346
  rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
343
347
  corr_dims, theta_scale, freq_factors, sections, item_ct1);
344
348
  });
@@ -127,11 +127,11 @@ static void soft_max_f32_submitter(const float * x, const T * mask, float * dst,
127
127
  const int nrows_y, const float scale, const float max_bias, const float m0,
128
128
  const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
129
129
  const size_t n_local_scratch, queue_ptr stream) {
130
- stream->submit([&](sycl::handler &cgh) {
130
+ sycl_launch(stream, [&](sycl::handler & cgh) {
131
131
  sycl::local_accessor<float, 1> local_buf_acc(n_local_scratch, cgh);
132
132
 
133
- cgh.parallel_for(
134
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
133
+ sycl_parallel_for(
134
+ cgh, sycl::nd_range<3>(block_nums * block_dims, block_dims),
135
135
  [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
136
136
  soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
137
137
  nrows_y, scale, max_bias, m0,
@@ -1,6 +1,7 @@
1
1
  #include "sycl_hw.hpp"
2
2
 
3
-
3
+ // TODO: currently not used
4
+ /*
4
5
  sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
5
6
  sycl_hw_info res;
6
7
  int32_t id = device_ptr->get_info<sycl::ext::intel::info::device::device_id>();
@@ -11,3 +12,4 @@ sycl_hw_info get_device_hw_info(sycl::device *device_ptr) {
11
12
 
12
13
  return res;
13
14
  }
15
+ */
@@ -10,6 +10,8 @@
10
10
 
11
11
  namespace syclex = sycl::ext::oneapi::experimental;
12
12
 
13
+ // TODO: currently not used
14
+ /*
13
15
  struct sycl_hw_info {
14
16
  syclex::architecture arch;
15
17
  int32_t device_id;
@@ -18,6 +20,7 @@ struct sycl_hw_info {
18
20
  bool is_in_vector(std::vector<int> &vec, int item);
19
21
 
20
22
  sycl_hw_info get_device_hw_info(sycl::device *device_ptr);
23
+ */
21
24
 
22
25
 
23
26
  #endif // SYCL_HW_HPP
@@ -45,14 +45,9 @@ static void timestep_embedding_f32_sycl(
45
45
  int num_blocks = (half_ceil + SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE;
46
46
  sycl::range<3> block_dims(1, 1, SYCL_TIMESTEP_EMBEDDING_BLOCK_SIZE);
47
47
  sycl::range<3> gridDim(1, ne00, num_blocks);
48
- stream->parallel_for(
49
- sycl::nd_range<3>(
50
- gridDim * block_dims, block_dims),
51
- [=](sycl::nd_item<3> item_ct1) {
52
- timestep_embedding_f32(
53
- x, dst, nb1, dim, max_period, item_ct1
54
- );
55
- });
48
+ sycl_parallel_for(stream, sycl::nd_range<3>(gridDim * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
49
+ timestep_embedding_f32(x, dst, nb1, dim, max_period, item_ct1);
50
+ });
56
51
  }
57
52
 
58
53
  void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -284,22 +284,23 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
284
284
  return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
285
285
  }
286
286
 
287
- __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
288
- const block_q8_1 * __restrict__ bq8_1, const int & iqs, int /* nblocks */) {
289
- const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
290
- const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
287
+ __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
288
+ const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
289
+ const sycl::half2 * q8_1_ds, const int & iqs) {
290
+ const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset.first;
291
+ const ggml_half d = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset.first));
291
292
  int v[q4_0_traits::vdr_mmvq];
292
293
  int u[2 * q4_0_traits::vdr_mmvq];
293
294
 
294
- #pragma unroll
295
295
 
296
+ #pragma unroll
296
297
  for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
297
298
  v[i] = get_int_from_uint8(bq4_0, iqs + i);
298
- u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
299
- u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
299
+ u[2 * i + 0] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i);
300
+ u[2 * i + 1] = get_int_from_int8_aligned(q8_1_quant_ptr, iqs + i + q4_0_traits::qi);
300
301
  }
301
302
 
302
- return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
303
+ return vec_dot_q4_0_q8_1_impl(v, u, d, *q8_1_ds);
303
304
  };
304
305
  };
305
306
 
@@ -346,24 +347,115 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
346
347
  using q4_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_K>;
347
348
  using q4_k_traits = typename q4_k_block::traits;
348
349
 
349
- float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
350
- const block_q8_1 * __restrict__ bq8_1, const int & iqs, int nblocks) {
351
- const int ib = ibx_offset / (QK_K / 2);
350
+ __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
351
+ const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
352
+ const sycl::half2 * q8_1_ds, const int & iqs) {
353
+ const int ib = ibx_offset.first / (QK_K / 2);
352
354
 
353
355
  const uint8_t * base = static_cast<const uint8_t *>(vbq);
354
- const uint8_t * qs = base + ibx_offset;
355
- const int total_qs_bytes = nblocks * (QK_K / 2);
356
- const uint8_t * scs = base + total_qs_bytes + ib * K_SCALE_SIZE;
357
- const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset);
356
+ const uint8_t * qs = base + ibx_offset.first;
357
+ const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE;
358
+ const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
358
359
 
359
360
  const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
360
361
  const int * q4 = (const int *) (qs + 16 * bq8_offset + 4 * ((iqs / 2) % 4));
361
362
  const uint16_t * scales = (const uint16_t *) scs;
362
363
 
363
- return vec_dot_q4_K_q8_1_common(q4, scales, *dms, bq8_1, iqs);
364
+ int v[2];
365
+ int u[2 * QR4_K];
366
+ float d8[QR4_K];
367
+
368
+ v[0] = q4[0];
369
+ v[1] = q4[4];
370
+
371
+ uint16_t aux[2];
372
+ const int j = (QR4_K * ((iqs / 2) / (QI8_1 / 2))) / 2;
373
+ if (j < 2) {
374
+ aux[0] = scales[j + 0] & 0x3f3f;
375
+ aux[1] = scales[j + 2] & 0x3f3f;
376
+ } else {
377
+ aux[0] = ((scales[j + 2] >> 0) & 0x0f0f) | ((scales[j - 2] & 0xc0c0) >> 2);
378
+ aux[1] = ((scales[j + 2] >> 4) & 0x0f0f) | ((scales[j - 0] & 0xc0c0) >> 2);
379
+ }
380
+
381
+ const uint8_t * sc = (const uint8_t *) aux;
382
+ const uint8_t * m = sc + 2;
383
+
384
+ for (int i = 0; i < QR4_K; ++i) {
385
+ const int8_t* quant_base_ptr = q8_1_quant_ptr + (bq8_offset + i) * QK8_1;
386
+ sycl::half2 ds_values = *(q8_1_ds + bq8_offset + i);
387
+
388
+ d8[i] = ds_values[0];
389
+
390
+ const int * q8 = (const int *) quant_base_ptr + ((iqs / 2) % 4);
391
+ u[2 * i + 0] = q8[0];
392
+ u[2 * i + 1] = q8[4];
393
+ }
394
+
395
+ return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, *dms, d8);
364
396
  }
365
397
  };
366
398
 
399
+ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
400
+ static constexpr ggml_type gtype = GGML_TYPE_Q6_K;
401
+
402
+ using q6_k_block = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q6_K>;
403
+ using q6_k_traits = typename q6_k_block::traits;
404
+
405
+ __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(const int vl, const int vh, const int * __restrict__ u,
406
+ const int8_t * __restrict__ scales, const float d,
407
+ const float * __restrict__ d8) {
408
+ float sumf = 0.0f;
409
+
410
+ #pragma unroll
411
+ for (int i = 0; i < QR6_K; ++i) {
412
+ const int sc = scales[4 * i];
413
+
414
+ const int vil = (vl >> (4 * i)) & 0x0F0F0F0F;
415
+
416
+ const int vih = ((vh >> (4 * i)) << 4) & 0x30303030;
417
+
418
+ const int vi = dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020,
419
+ dpct::sub_sat()); // vi = (vil | vih) - 32
420
+
421
+ sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
422
+ }
423
+
424
+ return d * sumf;
425
+ }
426
+
427
+ __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
428
+ const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
429
+ const int iqs) {
430
+ const int ib = ibx_offset.first / (QK_K / 2);
431
+
432
+ const uint8_t * base = static_cast<const uint8_t *>(vbq);
433
+ const uint8_t * ql = base + ibx_offset.first;
434
+ const uint8_t * qh = base + ibx_offset.second;
435
+ const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
436
+ const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib;
437
+
438
+ const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
439
+ const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);
440
+ const int vh_shift = 2 * ((iqs % (QI6_K / 2)) / (QI6_K / 4));
441
+
442
+ const int vl = get_int_from_uint8(ql, iqs);
443
+ const int vh = get_int_from_uint8(qh, (QI6_K / 4) * (iqs / (QI6_K / 2)) + iqs % (QI6_K / 4)) >> vh_shift;
444
+
445
+ const int8_t * scs = scales + scale_offset;
446
+
447
+ int u[QR6_K];
448
+ float d8[QR6_K];
449
+
450
+ #pragma unroll
451
+ for (int i = 0; i < QR6_K; ++i) {
452
+ u[i] = get_int_from_int8_aligned(q8_1_quant_ptr + (bq8_offset + 2 * i) * QK8_1, iqs % QI8_1);
453
+ const sycl::half2 ds_values = *(q8_1_ds + bq8_offset + 2 * i);
454
+ d8[i] = ds_values[0];
455
+ }
456
+ return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scs, *d, d8);
457
+ }
458
+ };
367
459
  #define VDR_Q4_0_Q8_1_MMVQ 2
368
460
  #define VDR_Q4_0_Q8_1_MMQ 4
369
461