@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -33,14 +33,11 @@ static void dequantize_block_sycl(const void *__restrict__ vx,
33
33
  {
34
34
  dpct::has_capability_or_fail(stream->get_device(),
35
35
  {sycl::aspect::fp16});
36
- stream->parallel_for(
37
- sycl::nd_range<3>(
38
- sycl::range<3>(1, 1, num_blocks) *
39
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
40
- sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
41
- [=](sycl::nd_item<3> item_ct1) {
42
- dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
43
- });
36
+ sycl_parallel_for(
37
+ stream,
38
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
39
+ sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
40
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1); });
44
41
  }
45
42
  }
46
43
 
@@ -53,24 +50,18 @@ static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k,
53
50
  dpct::has_capability_or_fail(stream->get_device(),
54
51
  {sycl::aspect::fp16});
55
52
 
56
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
57
- sycl::range<3>(1, 1, 64),
58
- sycl::range<3>(1, 1, 64)),
59
- [=](sycl::nd_item<3> item_ct1) {
60
- dequantize_block_q2_K(vx, y, item_ct1);
61
- });
53
+ sycl_parallel_for(
54
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
55
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
62
56
  }
63
57
  #else
64
58
  {
65
59
  dpct::has_capability_or_fail(stream->get_device(),
66
60
  {sycl::aspect::fp16});
67
61
 
68
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
69
- sycl::range<3>(1, 1, 32),
70
- sycl::range<3>(1, 1, 32)),
71
- [=](sycl::nd_item<3> item_ct1) {
72
- dequantize_block_q2_K(vx, y, item_ct1);
73
- });
62
+ sycl_parallel_for(
63
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
64
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q2_K(vx, y, item_ct1); });
74
65
  }
75
66
 
76
67
  #endif
@@ -85,24 +76,18 @@ static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k,
85
76
  dpct::has_capability_or_fail(stream->get_device(),
86
77
  {sycl::aspect::fp16});
87
78
 
88
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
89
- sycl::range<3>(1, 1, 64),
90
- sycl::range<3>(1, 1, 64)),
91
- [=](sycl::nd_item<3> item_ct1) {
92
- dequantize_block_q3_K(vx, y, item_ct1);
93
- });
79
+ sycl_parallel_for(
80
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
81
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
94
82
  }
95
83
  #else
96
84
  {
97
85
  dpct::has_capability_or_fail(stream->get_device(),
98
86
  {sycl::aspect::fp16});
99
87
 
100
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
101
- sycl::range<3>(1, 1, 32),
102
- sycl::range<3>(1, 1, 32)),
103
- [=](sycl::nd_item<3> item_ct1) {
104
- dequantize_block_q3_K(vx, y, item_ct1);
105
- });
88
+ sycl_parallel_for(
89
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
90
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q3_K(vx, y, item_ct1); });
106
91
  }
107
92
  #endif
108
93
  }
@@ -116,12 +101,9 @@ static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k,
116
101
  dpct::has_capability_or_fail(stream->get_device(),
117
102
  {sycl::aspect::fp16});
118
103
 
119
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
120
- sycl::range<3>(1, 1, 32),
121
- sycl::range<3>(1, 1, 32)),
122
- [=](sycl::nd_item<3> item_ct1) {
123
- dequantize_block_q4_0(vx, y, nb32, item_ct1);
124
- });
104
+ sycl_parallel_for(
105
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
106
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_0(vx, y, nb32, item_ct1); });
125
107
  }
126
108
  }
127
109
 
@@ -135,13 +117,12 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int
135
117
  int constexpr WARP_K = WARP_SIZE * QK4_0;
136
118
  const int n_warp = (k + WARP_K - 1) / WARP_K;
137
119
  GGML_ASSERT(k % 2 == 0);
138
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
139
- sycl::range<3>(1, 1, WARP_SIZE),
140
- sycl::range<3>(1, 1, WARP_SIZE)),
141
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
142
- dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
143
- });
144
-
120
+ sycl_parallel_for(stream,
121
+ sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * sycl::range<3>(1, 1, WARP_SIZE),
122
+ sycl::range<3>(1, 1, WARP_SIZE)),
123
+ [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
124
+ dequantize_block_q4_0_reorder(vx, y, k, item_ct1);
125
+ });
145
126
  }
146
127
 
147
128
  template <typename dst_t>
@@ -153,12 +134,9 @@ static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
153
134
  dpct::has_capability_or_fail(stream->get_device(),
154
135
  {sycl::aspect::fp16});
155
136
 
156
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
157
- sycl::range<3>(1, 1, 32),
158
- sycl::range<3>(1, 1, 32)),
159
- [=](sycl::nd_item<3> item_ct1) {
160
- dequantize_block_q4_1(vx, y, nb32, item_ct1);
161
- });
137
+ sycl_parallel_for(
138
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
139
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q4_1(vx, y, nb32, item_ct1); });
162
140
  }
163
141
  }
164
142
 
@@ -171,14 +149,13 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
171
149
  dpct::has_capability_or_fail(stream->get_device(),
172
150
  {sycl::aspect::fp16});
173
151
 
174
- stream->submit([&](sycl::handler &cgh) {
152
+ sycl_launch(stream, [&](sycl::handler & cgh) {
175
153
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
176
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
177
- sycl::range<3>(1, 1, 32),
178
- sycl::range<3>(1, 1, 32)),
179
- [=](sycl::nd_item<3> item_ct1) {
180
- dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
181
- });
154
+ sycl_parallel_for(
155
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
156
+ [=](sycl::nd_item<3> item_ct1) {
157
+ dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1);
158
+ });
182
159
  });
183
160
  }
184
161
  }
@@ -191,13 +168,13 @@ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const i
191
168
 
192
169
  dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
193
170
 
194
- stream->submit([&](sycl::handler & cgh) {
171
+ sycl_launch(stream, [&](sycl::handler & cgh) {
195
172
  sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
196
173
 
197
- cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
198
- [=](sycl::nd_item<1> item_ct1) {
199
- dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
200
- });
174
+ sycl_parallel_for<1>(cgh, sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
175
+ [=](sycl::nd_item<1> item_ct1) {
176
+ dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
177
+ });
201
178
  });
202
179
  }
203
180
 
@@ -210,24 +187,18 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
210
187
  dpct::has_capability_or_fail(stream->get_device(),
211
188
  {sycl::aspect::fp16});
212
189
 
213
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
214
- sycl::range<3>(1, 1, 64),
215
- sycl::range<3>(1, 1, 64)),
216
- [=](sycl::nd_item<3> item_ct1) {
217
- dequantize_block_q5_K(vx, y, item_ct1);
218
- });
190
+ sycl_parallel_for(
191
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
192
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
219
193
  }
220
194
  #else
221
195
  {
222
196
  dpct::has_capability_or_fail(stream->get_device(),
223
197
  {sycl::aspect::fp16});
224
198
 
225
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
226
- sycl::range<3>(1, 1, 32),
227
- sycl::range<3>(1, 1, 32)),
228
- [=](sycl::nd_item<3> item_ct1) {
229
- dequantize_block_q5_K(vx, y, item_ct1);
230
- });
199
+ sycl_parallel_for(
200
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
201
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q5_K(vx, y, item_ct1); });
231
202
  }
232
203
 
233
204
  #endif
@@ -242,29 +213,34 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
242
213
  dpct::has_capability_or_fail(stream->get_device(),
243
214
  {sycl::aspect::fp16});
244
215
 
245
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
246
- sycl::range<3>(1, 1, 64),
247
- sycl::range<3>(1, 1, 64)),
248
- [=](sycl::nd_item<3> item_ct1) {
249
- dequantize_block_q6_K(vx, y, item_ct1);
250
- });
216
+ sycl_parallel_for(
217
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
218
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
251
219
  }
252
220
  #else
253
221
  {
254
222
  dpct::has_capability_or_fail(stream->get_device(),
255
223
  {sycl::aspect::fp16});
256
224
 
257
- stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
258
- sycl::range<3>(1, 1, 32),
259
- sycl::range<3>(1, 1, 32)),
260
- [=](sycl::nd_item<3> item_ct1) {
261
- dequantize_block_q6_K(vx, y, item_ct1);
262
- });
225
+ sycl_parallel_for(
226
+ stream, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
227
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K(vx, y, item_ct1); });
263
228
  }
264
229
 
265
230
  #endif
266
231
  }
267
232
 
233
+ template <typename dst_t>
234
+ static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
235
+ const int64_t nb = k / QK_K;
236
+
237
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
238
+
239
+ sycl_parallel_for(stream,
240
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
241
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); });
242
+ }
243
+
268
244
  template <typename dst_t>
269
245
  static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
270
246
  dpct::queue_ptr stream) {
@@ -273,15 +249,10 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k,
273
249
  dpct::has_capability_or_fail(stream->get_device(),
274
250
  {sycl::aspect::fp16});
275
251
 
276
- stream->submit([&](sycl::handler &cgh) {
277
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
278
- sycl::range<3>(1, 1, 32),
279
- sycl::range<3>(1, 1, 32)),
280
- [=](sycl::nd_item<3> item_ct1) {
281
- dequantize_block_iq1_s(
282
- vx, y, item_ct1, iq1s_grid_gpu
283
- );
284
- });
252
+ sycl_launch(stream, [&](sycl::handler & cgh) {
253
+ sycl_parallel_for(
254
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
255
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_s(vx, y, item_ct1, iq1s_grid_gpu); });
285
256
  });
286
257
  }
287
258
  }
@@ -294,15 +265,10 @@ static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k,
294
265
  dpct::has_capability_or_fail(stream->get_device(),
295
266
  {sycl::aspect::fp16});
296
267
 
297
- stream->submit([&](sycl::handler &cgh) {
298
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
299
- sycl::range<3>(1, 1, 32),
300
- sycl::range<3>(1, 1, 32)),
301
- [=](sycl::nd_item<3> item_ct1) {
302
- dequantize_block_iq1_m(
303
- vx, y, item_ct1, iq1s_grid_gpu
304
- );
305
- });
268
+ sycl_launch(stream, [&](sycl::handler & cgh) {
269
+ sycl_parallel_for(
270
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
271
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq1_m(vx, y, item_ct1, iq1s_grid_gpu); });
306
272
  });
307
273
  }
308
274
  }
@@ -315,15 +281,12 @@ static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t
315
281
  dpct::has_capability_or_fail(stream->get_device(),
316
282
  {sycl::aspect::fp16});
317
283
 
318
- stream->submit([&](sycl::handler &cgh) {
319
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
320
- sycl::range<3>(1, 1, 32),
321
- sycl::range<3>(1, 1, 32)),
322
- [=](sycl::nd_item<3> item_ct1) {
323
- dequantize_block_iq2_xxs(
324
- vx, y, item_ct1, iq2xxs_grid,
325
- ksigns_iq2xs, kmask_iq2xs);
326
- });
284
+ sycl_launch(stream, [&](sycl::handler & cgh) {
285
+ sycl_parallel_for(
286
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
287
+ [=](sycl::nd_item<3> item_ct1) {
288
+ dequantize_block_iq2_xxs(vx, y, item_ct1, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs);
289
+ });
327
290
  });
328
291
  }
329
292
  }
@@ -336,15 +299,12 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k
336
299
  dpct::has_capability_or_fail(stream->get_device(),
337
300
  {sycl::aspect::fp16});
338
301
 
339
- stream->submit([&](sycl::handler &cgh) {
340
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
341
- sycl::range<3>(1, 1, 32),
342
- sycl::range<3>(1, 1, 32)),
343
- [=](sycl::nd_item<3> item_ct1) {
344
- dequantize_block_iq2_xs(
345
- vx, y, item_ct1, iq2xs_grid,
346
- ksigns_iq2xs, kmask_iq2xs);
347
- });
302
+ sycl_launch(stream, [&](sycl::handler & cgh) {
303
+ sycl_parallel_for(
304
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
305
+ [=](sycl::nd_item<3> item_ct1) {
306
+ dequantize_block_iq2_xs(vx, y, item_ct1, iq2xs_grid, ksigns_iq2xs, kmask_iq2xs);
307
+ });
348
308
  });
349
309
  }
350
310
  }
@@ -357,13 +317,10 @@ static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k,
357
317
  dpct::has_capability_or_fail(stream->get_device(),
358
318
  {sycl::aspect::fp16});
359
319
 
360
- stream->submit([&](sycl::handler &cgh) {
361
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
362
- sycl::range<3>(1, 1, 32),
363
- sycl::range<3>(1, 1, 32)),
364
- [=](sycl::nd_item<3> item_ct1) {
365
- dequantize_block_iq2_s(vx, y, item_ct1);
366
- });
320
+ sycl_launch(stream, [&](sycl::handler & cgh) {
321
+ sycl_parallel_for(
322
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
323
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_s(vx, y, item_ct1); });
367
324
  });
368
325
  }
369
326
  }
@@ -377,15 +334,12 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t
377
334
  dpct::has_capability_or_fail(stream->get_device(),
378
335
  {sycl::aspect::fp16});
379
336
 
380
- stream->submit([&](sycl::handler &cgh) {
381
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
382
- sycl::range<3>(1, 1, 32),
383
- sycl::range<3>(1, 1, 32)),
384
- [=](sycl::nd_item<3> item_ct1) {
385
- dequantize_block_iq3_xxs(
386
- vx, y, item_ct1, iq3xxs_grid,
387
- ksigns_iq2xs, kmask_iq2xs);
388
- });
337
+ sycl_launch(stream, [&](sycl::handler & cgh) {
338
+ sycl_parallel_for(
339
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
340
+ [=](sycl::nd_item<3> item_ct1) {
341
+ dequantize_block_iq3_xxs(vx, y, item_ct1, iq3xxs_grid, ksigns_iq2xs, kmask_iq2xs);
342
+ });
389
343
  });
390
344
  }
391
345
  }
@@ -398,14 +352,10 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k,
398
352
  dpct::has_capability_or_fail(stream->get_device(),
399
353
  {sycl::aspect::fp16});
400
354
 
401
- stream->submit([&](sycl::handler &cgh) {
402
- cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
403
- sycl::range<3>(1, 1, 32),
404
- sycl::range<3>(1, 1, 32)),
405
- [=](sycl::nd_item<3> item_ct1) {
406
- dequantize_block_iq3_s(
407
- vx, y, item_ct1, kmask_iq2xs, iq3s_grid);
408
- });
355
+ sycl_launch(stream, [&](sycl::handler & cgh) {
356
+ sycl_parallel_for(
357
+ cgh, sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
358
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s(vx, y, item_ct1, kmask_iq2xs, iq3s_grid); });
409
359
  });
410
360
  }
411
361
  }
@@ -421,14 +371,11 @@ static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k
421
371
  dpct::has_capability_or_fail(stream->get_device(),
422
372
  {sycl::aspect::fp16});
423
373
 
424
- stream->submit([&](sycl::handler &cgh) {
425
- cgh.parallel_for(
426
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
427
- sycl::range<3>(1, 1, 32),
428
- sycl::range<3>(1, 1, 32)),
429
- [=](sycl::nd_item<3> item_ct1) {
430
- dequantize_block_iq4_xs(vx, y, item_ct1);
431
- });
374
+ sycl_launch(stream, [&](sycl::handler & cgh) {
375
+ sycl_parallel_for(
376
+ cgh,
377
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
378
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_xs(vx, y, item_ct1); });
432
379
  });
433
380
  }
434
381
  #endif
@@ -442,14 +389,11 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
442
389
  dpct::has_capability_or_fail(stream->get_device(),
443
390
  {sycl::aspect::fp16});
444
391
 
445
- stream->submit([&](sycl::handler &cgh) {
446
- cgh.parallel_for(
447
- sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
448
- sycl::range<3>(1, 1, 32),
449
- sycl::range<3>(1, 1, 32)),
450
- [=](sycl::nd_item<3> item_ct1) {
451
- dequantize_block_iq4_nl(vx, y, item_ct1);
452
- });
392
+ sycl_launch(stream, [&](sycl::handler & cgh) {
393
+ sycl_parallel_for(
394
+ cgh,
395
+ sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
396
+ [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq4_nl(vx, y, item_ct1); });
453
397
  });
454
398
  }
455
399
  }
@@ -530,7 +474,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
530
474
  case GGML_TYPE_Q5_K:
531
475
  return dequantize_row_q5_K_sycl;
532
476
  case GGML_TYPE_Q6_K:
533
- return dequantize_row_q6_K_sycl;
477
+ if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
478
+ return dequantize_row_q6_K_sycl_reorder;
479
+ } else {
480
+ return dequantize_row_q6_K_sycl;
481
+ }
534
482
  case GGML_TYPE_IQ1_S:
535
483
  return dequantize_row_iq1_s_sycl;
536
484
  case GGML_TYPE_IQ1_M:
@@ -587,7 +535,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
587
535
  case GGML_TYPE_Q5_K:
588
536
  return dequantize_row_q5_K_sycl;
589
537
  case GGML_TYPE_Q6_K:
590
- return dequantize_row_q6_K_sycl;
538
+ if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
539
+ return dequantize_row_q6_K_sycl_reorder;
540
+ } else {
541
+ return dequantize_row_q6_K_sycl;
542
+ }
591
543
  case GGML_TYPE_IQ1_S:
592
544
  return dequantize_row_iq1_s_sycl;
593
545
  case GGML_TYPE_IQ1_M: