@novastera-oss/llamarn 0.2.6 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (253)
  1. package/android/src/main/cpp/include/llama.h +141 -38
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +58 -24
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +37 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +53 -40
  26. package/cpp/llama.cpp/common/common.h +6 -2
  27. package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
  28. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  29. package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
  30. package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
  31. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  32. package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
  33. package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
  34. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
  35. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  37. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  38. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
  66. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  67. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
  68. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  69. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
  70. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  71. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
  72. package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
  73. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
  74. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
  76. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  79. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
  81. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
  84. package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
  85. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  86. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
  87. package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
  88. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  89. package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
  90. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  91. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
  92. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
  93. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
  94. package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
  95. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  96. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
  97. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  98. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  99. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  101. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  102. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  103. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  104. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  105. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  106. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  107. package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
  108. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
  109. package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
  110. package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
  111. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
  112. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
  113. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  114. package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
  115. package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
  116. package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
  117. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  118. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
  119. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
  120. package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
  121. package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
  122. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
  123. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
  124. package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
  125. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  126. package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
  127. package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
  128. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
  129. package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
  130. package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
  131. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  132. package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
  133. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
  134. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
  135. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
  136. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  137. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  138. package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
  139. package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
  140. package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
  141. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
  142. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
  143. package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
  144. package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
  145. package/cpp/llama.cpp/include/llama.h +141 -38
  146. package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
  147. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  148. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  149. package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
  150. package/cpp/llama.cpp/src/llama-arch.h +25 -1
  151. package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
  152. package/cpp/llama.cpp/src/llama-batch.h +110 -57
  153. package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
  154. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  155. package/cpp/llama.cpp/src/llama-context.cpp +360 -266
  156. package/cpp/llama.cpp/src/llama-context.h +27 -23
  157. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  158. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  159. package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
  160. package/cpp/llama.cpp/src/llama-graph.h +126 -58
  161. package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
  162. package/cpp/llama.cpp/src/llama-hparams.h +16 -2
  163. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
  164. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
  165. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
  166. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
  167. package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
  168. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
  169. package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
  170. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
  171. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
  172. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  173. package/cpp/llama.cpp/src/llama-memory.h +73 -36
  174. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  175. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  176. package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
  177. package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
  178. package/cpp/llama.cpp/src/llama-model.h +26 -0
  179. package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
  180. package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
  181. package/cpp/llama.cpp/src/llama-vocab.h +1 -0
  182. package/cpp/llama.cpp/src/llama.cpp +11 -7
  183. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  184. package/cpp/rn-completion.cpp +2 -2
  185. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  186. package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
  187. package/ios/include/chat.h +1 -1
  188. package/ios/include/common.h +6 -2
  189. package/ios/include/llama.h +141 -38
  190. package/ios/libs/llama.xcframework/Info.plist +15 -15
  191. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  192. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  193. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  194. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
  195. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
  196. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  197. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  198. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  199. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  200. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  201. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  202. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  203. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  204. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  205. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  206. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
  207. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
  208. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
  209. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
  210. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
  211. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
  212. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
  213. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  214. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
  215. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
  216. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
  217. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  218. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  219. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  220. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
  221. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  222. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
  223. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
  224. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  225. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  226. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
  227. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
  228. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  229. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  230. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  231. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  232. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  233. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
  234. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
  235. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
  236. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
  237. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  238. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  239. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
  240. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
  241. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
  242. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
  243. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
  244. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  245. package/package.json +1 -2
  246. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  247. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  248. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  249. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  250. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  251. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  252. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  253. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
@@ -329,60 +329,51 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
329
329
  const int ne12, const int nb1, const int nb2,
330
330
  const int offset, queue_ptr stream) {
331
331
  int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
332
- stream->parallel_for(
333
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
334
- sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
335
- sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
336
- [=](sycl::nd_item<3> item_ct1) {
337
- acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
338
- item_ct1);
339
- });
332
+ sycl_parallel_for(stream,
333
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
334
+ sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
335
+ [=](sycl::nd_item<3> item_ct1) {
336
+ acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);
337
+ });
340
338
  }
341
339
 
342
340
  template<typename T>
343
341
  static void gelu_sycl(const T *x, T *dst, const int k,
344
342
  queue_ptr stream) {
345
343
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
346
- stream->parallel_for(
347
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
348
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
349
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
350
- [=](sycl::nd_item<3> item_ct1) {
351
- gelu(x, dst, k, item_ct1);
352
- });
344
+ sycl_parallel_for(stream,
345
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
346
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
347
+ [=](sycl::nd_item<3> item_ct1) { gelu(x, dst, k, item_ct1); });
353
348
  }
354
349
 
355
350
  template<typename T>
356
351
  static void silu_sycl(const T *x, T *dst, const int k,
357
352
  queue_ptr stream) {
358
353
  const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
359
- stream->parallel_for(
360
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
361
- sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
362
- sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
363
- [=](sycl::nd_item<3> item_ct1) {
364
- silu(x, dst, k, item_ct1);
365
- });
354
+ sycl_parallel_for(stream,
355
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
356
+ sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
357
+ [=](sycl::nd_item<3> item_ct1) { silu(x, dst, k, item_ct1); });
366
358
  }
367
359
 
368
360
  template<typename T>
369
361
  static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
370
362
  // hard code for now
371
363
  const int num_blocks = ceil_div(k, 256);
372
- stream->parallel_for(
373
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
374
- sgn(x, dst, k, item_ct1);
375
- });
364
+ sycl_parallel_for(
365
+ stream, sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)),
366
+ [=](sycl::nd_item<3> item_ct1) { sgn(x, dst, k, item_ct1); });
376
367
  }
377
368
 
378
369
  template<typename T>
379
370
  static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
380
371
  // hard code for now
381
372
  const int num_blocks = ceil_div(k, 256);
382
- stream->parallel_for(
383
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
384
- abs_op(x, dst, k, item_ct1);
385
- });
373
+ sycl_parallel_for(
374
+ stream,
375
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
376
+ [=](sycl::nd_item<3> item_ct1) { abs_op(x, dst, k, item_ct1); });
386
377
  }
387
378
 
388
379
 
@@ -390,23 +381,20 @@ template<typename T>
390
381
  static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) {
391
382
  // hard code for now
392
383
  const int num_blocks = ceil_div(k, 256);
393
- stream->parallel_for(
394
- sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) {
395
- elu_op(x, dst, k, item_ct1);
396
- });
384
+ sycl_parallel_for(
385
+ stream,
386
+ sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)),
387
+ [=](sycl::nd_item<3> item_ct1) { elu_op(x, dst, k, item_ct1); });
397
388
  }
398
389
 
399
390
  template<typename T>
400
391
  static void gelu_quick_sycl(const T *x, T *dst, const int k,
401
392
  queue_ptr stream) {
402
393
  const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
403
- stream->parallel_for(
404
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
405
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
406
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
407
- [=](sycl::nd_item<3> item_ct1) {
408
- gelu_quick(x, dst, k, item_ct1);
409
- });
394
+ sycl_parallel_for(stream,
395
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
396
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
397
+ [=](sycl::nd_item<3> item_ct1) { gelu_quick(x, dst, k, item_ct1); });
410
398
  }
411
399
 
412
400
 
@@ -414,169 +402,133 @@ template<typename T>
414
402
  static void gelu_erf_sycl(const T *x, T *dst, const int k,
415
403
  queue_ptr stream) {
416
404
  const int num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
417
- stream->parallel_for(
418
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
419
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
420
- sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
421
- [=](sycl::nd_item<3> item_ct1) {
422
- gelu_erf(x, dst, k, item_ct1);
423
- });
405
+ sycl_parallel_for(stream,
406
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
407
+ sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
408
+ [=](sycl::nd_item<3> item_ct1) { gelu_erf(x, dst, k, item_ct1); });
424
409
  }
425
410
 
426
411
  template<typename T>
427
412
  static void tanh_sycl(const T *x, T *dst, const int k,
428
413
  queue_ptr stream) {
429
414
  const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
430
- stream->parallel_for(
431
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
432
- sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
433
- sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
434
- [=](sycl::nd_item<3> item_ct1) {
435
- tanh(x, dst, k, item_ct1);
436
- });
415
+ sycl_parallel_for(stream,
416
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
417
+ sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
418
+ [=](sycl::nd_item<3> item_ct1) { tanh(x, dst, k, item_ct1); });
437
419
  }
438
420
 
439
421
  template<typename T>
440
422
  static void relu_sycl(const T *x, T *dst, const int k,
441
423
  queue_ptr stream) {
442
424
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
443
- stream->parallel_for(
444
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
445
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
446
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
447
- [=](sycl::nd_item<3> item_ct1) {
448
- relu(x, dst, k, item_ct1);
449
- });
425
+ sycl_parallel_for(stream,
426
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
427
+ sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
428
+ [=](sycl::nd_item<3> item_ct1) { relu(x, dst, k, item_ct1); });
450
429
  }
451
430
 
452
431
  template<typename T>
453
432
  static void hardsigmoid_sycl(const T *x, T *dst, const int k,
454
433
  queue_ptr stream) {
455
434
  const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE;
456
- stream->parallel_for(
457
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
458
- sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
435
+ sycl_parallel_for(
436
+ stream,
437
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE),
459
438
  sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)),
460
- [=](sycl::nd_item<3> item_ct1) {
461
- hardsigmoid(x, dst, k, item_ct1);
462
- });
439
+ [=](sycl::nd_item<3> item_ct1) { hardsigmoid(x, dst, k, item_ct1); });
463
440
  }
464
441
 
465
442
  template<typename T>
466
443
  static void hardswish_sycl(const T *x, T *dst, const int k,
467
444
  queue_ptr stream) {
468
445
  const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE;
469
- stream->parallel_for(
470
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
471
- sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
446
+ sycl_parallel_for(
447
+ stream,
448
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE),
472
449
  sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)),
473
- [=](sycl::nd_item<3> item_ct1) {
474
- hardswish(x, dst, k, item_ct1);
475
- });
450
+ [=](sycl::nd_item<3> item_ct1) { hardswish(x, dst, k, item_ct1); });
476
451
  }
477
452
 
478
453
  template<typename T>
479
454
  static void exp_sycl(const T *x, T *dst, const int k,
480
455
  queue_ptr stream) {
481
456
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
482
- stream->parallel_for(
483
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
484
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
485
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
486
- [=](sycl::nd_item<3> item_ct1) {
487
- exp(x, dst, k, item_ct1);
488
- });
457
+ sycl_parallel_for(stream,
458
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
459
+ sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
460
+ [=](sycl::nd_item<3> item_ct1) { exp(x, dst, k, item_ct1); });
489
461
  }
490
462
 
491
463
  template<typename T>
492
464
  static void log_sycl(const T *x, T *dst, const int k,
493
465
  queue_ptr stream) {
494
466
  const int num_blocks = (k + SYCL_EXP_BLOCK_SIZE - 1) / SYCL_EXP_BLOCK_SIZE;
495
- stream->parallel_for(
496
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
497
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
498
- sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
499
- [=](sycl::nd_item<3> item_ct1) {
500
- log(x, dst, k, item_ct1);
501
- });
467
+ sycl_parallel_for(stream,
468
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE),
469
+ sycl::range<3>(1, 1, SYCL_EXP_BLOCK_SIZE)),
470
+ [=](sycl::nd_item<3> item_ct1) { log(x, dst, k, item_ct1); });
502
471
  }
503
472
 
504
473
  template<typename T>
505
474
  static void neg_sycl(const T *x, T *dst, const int k,
506
475
  queue_ptr stream) {
507
476
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
508
- stream->parallel_for(
509
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
510
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
511
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
512
- [=](sycl::nd_item<3> item_ct1) {
513
- neg(x, dst, k, item_ct1);
514
- });
477
+ sycl_parallel_for(stream,
478
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
479
+ sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
480
+ [=](sycl::nd_item<3> item_ct1) { neg(x, dst, k, item_ct1); });
515
481
  }
516
482
 
517
483
  template<typename T>
518
484
  static void step_sycl(const T *x, T *dst, const int k,
519
485
  queue_ptr stream) {
520
486
  const int num_blocks = (k + SYCL_NEG_BLOCK_SIZE - 1) / SYCL_NEG_BLOCK_SIZE;
521
- stream->parallel_for(
522
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
523
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
524
- sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
525
- [=](sycl::nd_item<3> item_ct1) {
526
- step(x, dst, k, item_ct1);
527
- });
487
+ sycl_parallel_for(stream,
488
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE),
489
+ sycl::range<3>(1, 1, SYCL_NEG_BLOCK_SIZE)),
490
+ [=](sycl::nd_item<3> item_ct1) { step(x, dst, k, item_ct1); });
528
491
  }
529
492
 
530
493
  template<typename T>
531
494
  static void sigmoid_sycl(const T *x, T *dst, const int k,
532
495
  queue_ptr stream) {
533
496
  const int num_blocks = (k + SYCL_SIGMOID_BLOCK_SIZE - 1) / SYCL_SIGMOID_BLOCK_SIZE;
534
- stream->parallel_for(
535
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
536
- sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
497
+ sycl_parallel_for(
498
+ stream,
499
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE),
537
500
  sycl::range<3>(1, 1, SYCL_SIGMOID_BLOCK_SIZE)),
538
- [=](sycl::nd_item<3> item_ct1) {
539
- sigmoid(x, dst, k, item_ct1);
540
- });
501
+ [=](sycl::nd_item<3> item_ct1) { sigmoid(x, dst, k, item_ct1); });
541
502
  }
542
503
 
543
504
  template<typename T>
544
505
  static void sqrt_sycl(const T *x, T *dst, const int k,
545
506
  queue_ptr stream) {
546
507
  const int num_blocks = (k + SYCL_SQRT_BLOCK_SIZE - 1) / SYCL_SQRT_BLOCK_SIZE;
547
- stream->parallel_for(
548
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
549
- sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
550
- sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
551
- [=](sycl::nd_item<3> item_ct1) {
552
- sqrt(x, dst, k, item_ct1);
553
- });
508
+ sycl_parallel_for(stream,
509
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE),
510
+ sycl::range<3>(1, 1, SYCL_SQRT_BLOCK_SIZE)),
511
+ [=](sycl::nd_item<3> item_ct1) { sqrt(x, dst, k, item_ct1); });
554
512
  }
555
513
 
556
514
  template<typename T>
557
515
  static void sin_sycl(const T *x, T *dst, const int k,
558
516
  queue_ptr stream) {
559
517
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
560
- stream->parallel_for(
561
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
562
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
563
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
564
- [=](sycl::nd_item<3> item_ct1) {
565
- sin(x, dst, k, item_ct1);
566
- });
518
+ sycl_parallel_for(stream,
519
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
520
+ sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
521
+ [=](sycl::nd_item<3> item_ct1) { sin(x, dst, k, item_ct1); });
567
522
  }
568
523
 
569
524
  template<typename T>
570
525
  static void cos_sycl(const T *x, T *dst, const int k,
571
526
  queue_ptr stream) {
572
527
  const int num_blocks = (k + SYCL_SIN_BLOCK_SIZE - 1) / SYCL_SIN_BLOCK_SIZE;
573
- stream->parallel_for(
574
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
575
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
576
- sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
577
- [=](sycl::nd_item<3> item_ct1) {
578
- cos(x, dst, k, item_ct1);
579
- });
528
+ sycl_parallel_for(stream,
529
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE),
530
+ sycl::range<3>(1, 1, SYCL_SIN_BLOCK_SIZE)),
531
+ [=](sycl::nd_item<3> item_ct1) { cos(x, dst, k, item_ct1); });
580
532
  }
581
533
 
582
534
  template<typename T>
@@ -584,26 +536,20 @@ static void leaky_relu_sycl(const T *x, T *dst, const int k,
584
536
  const float negative_slope,
585
537
  queue_ptr stream) {
586
538
  const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
587
- stream->parallel_for(
588
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
589
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
590
- sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
591
- [=](sycl::nd_item<3> item_ct1) {
592
- leaky_relu(x, dst, k, negative_slope, item_ct1);
593
- });
539
+ sycl_parallel_for(stream,
540
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
541
+ sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
542
+ [=](sycl::nd_item<3> item_ct1) { leaky_relu(x, dst, k, negative_slope, item_ct1); });
594
543
  }
595
544
 
596
545
  template<typename T>
597
546
  static void sqr_sycl(const T *x, T *dst, const int k,
598
547
  queue_ptr stream) {
599
548
  const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
600
- stream->parallel_for(
601
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
602
- sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
603
- sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
604
- [=](sycl::nd_item<3> item_ct1) {
605
- sqr(x, dst, k, item_ct1);
606
- });
549
+ sycl_parallel_for(stream,
550
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
551
+ sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
552
+ [=](sycl::nd_item<3> item_ct1) { sqr(x, dst, k, item_ct1); });
607
553
  }
608
554
 
609
555
  template<typename T>
@@ -614,9 +560,8 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
614
560
  int dst_size = ne10 * ne11 * ne12 * ne13;
615
561
  int num_blocks = (dst_size + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
616
562
  sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE);
617
- stream->parallel_for(
618
- sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)),
619
- [=](sycl::nd_item<1> item_ct1) {
563
+ sycl_parallel_for<1>(
564
+ stream, sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
620
565
  upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1);
621
566
  });
622
567
  }
@@ -627,12 +572,10 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
627
572
  const int ne1, const int ne2, queue_ptr stream) {
628
573
  int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
629
574
  sycl::range<3> gridDim(ne2, ne1, num_blocks);
630
- stream->parallel_for(
631
- sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
632
- sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
633
- [=](sycl::nd_item<3> item_ct1) {
634
- pad(x, dst, ne0, ne00, ne01, ne02, item_ct1);
635
- });
575
+ sycl_parallel_for(stream,
576
+ sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
577
+ sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
578
+ [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
636
579
  }
637
580
 
638
581
  template<typename T>
@@ -640,13 +583,10 @@ static void clamp_sycl(const T *x, T *dst, const float min,
640
583
  const float max, const int k,
641
584
  queue_ptr stream) {
642
585
  const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
643
- stream->parallel_for(
644
- sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
645
- sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
646
- sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
647
- [=](sycl::nd_item<3> item_ct1) {
648
- clamp(x, dst, min, max, k, item_ct1);
649
- });
586
+ sycl_parallel_for(stream,
587
+ sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
588
+ sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
589
+ [=](sycl::nd_item<3> item_ct1) { clamp(x, dst, min, max, k, item_ct1); });
650
590
  }
651
591
 
652
592
  inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
@@ -65,6 +65,9 @@ public:
65
65
 
66
66
  dnnl::primitive_attr primitive_attr;
67
67
  primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
68
+ #ifdef GGML_SYCL_F16
69
+ primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16);
70
+ #endif
68
71
 
69
72
  auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
70
73
  auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
@@ -60,54 +60,6 @@ static void k_get_rows(
60
60
  dst_row[iybs + iqs + y_offset] = v.y();
61
61
  }
62
62
 
63
- template<int qk, int qr, dequantize_kernel_t_reorder dequantize_kernel_recorder, typename dst_t>
64
- static void k_get_rows_reorder(
65
- const void * src0, const void *src0_dq, const int32_t * src1, dst_t * dst,
66
- int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
67
- /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
68
- /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
69
- /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
70
- size_t s10, size_t s11, size_t s12,
71
- const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
72
-
73
- const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
74
- item_ct1.get_local_id(2)) *
75
- 2;
76
- const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
77
- item_ct1.get_local_id(1);
78
- const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
79
- item_ct1.get_local_id(0)) /
80
- ne12;
81
- const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
82
- item_ct1.get_local_id(0)) %
83
- ne12;
84
-
85
- if (i00 >= ne00) {
86
- return;
87
- }
88
- auto ncols = ne00;
89
- const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
90
-
91
- dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
92
-
93
- const int src0_off = i01 * ncols + i00;
94
- const int ib = src0_off / QK4_0; // block index
95
- const int iqs = (i00%qk)/qr; // x quant index
96
- const int iybs = i00 - i00%qk; // dst block start index
97
- const int y_offset = qr == 1 ? 1 : qk/2;
98
-
99
- // dequantize
100
- dfloat2 v;
101
- dequantize_kernel_recorder((const void *)src0_dq, ib, (const void *)src0, src0_off/2, v);
102
-
103
- dst_row[iybs + iqs + 0] = v.x();
104
- dst_row[iybs + iqs + y_offset] = v.y();
105
-
106
- GGML_UNUSED(nb01);
107
- GGML_UNUSED(nb02);
108
- GGML_UNUSED(nb03);
109
- }
110
-
111
63
  template<typename src0_t, typename dst_t>
112
64
  static void k_get_rows_float(
113
65
  const src0_t * src0, const int32_t * src1, dst_t * dst,
@@ -166,58 +118,15 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
166
118
 
167
119
  GGML_ASSERT(ne00 % 2 == 0);
168
120
 
169
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
170
- [=](sycl::nd_item<3> item_ct1) {
171
- k_get_rows<qk, qr, dq>(
172
- src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
173
- s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
174
- });
175
-
176
- GGML_UNUSED(dst);
177
- GGML_UNUSED(ctx);
178
- }
179
-
180
- template <int qk, int qr, dequantize_kernel_t_reorder dq_reorder>
181
- static void get_rows_sycl_reorder(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1,
182
- ggml_tensor *dst, const void *src0_dd,
183
- const int32_t *src1_dd, float *dst_dd,
184
- queue_ptr stream) {
185
-
186
- GGML_TENSOR_BINARY_OP_LOCALS
187
-
188
- const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
189
- const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
190
- const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
191
-
192
- // strides in elements
193
- //const size_t s0 = nb0 / ggml_element_size(dst);
194
- const size_t s1 = nb1 / ggml_element_size(dst);
195
- const size_t s2 = nb2 / ggml_element_size(dst);
196
- const size_t s3 = nb3 / ggml_element_size(dst);
197
-
198
- const size_t s10 = nb10 / ggml_element_size(src1);
199
- const size_t s11 = nb11 / ggml_element_size(src1);
200
- const size_t s12 = nb12 / ggml_element_size(src1);
201
- //const size_t s13 = nb13 / ggml_element_size(src1);
202
-
203
- GGML_ASSERT(ne00 % 2 == 0);
204
-
205
- const uint8_t* src0_q = (const uint8_t*)src0_dd;
206
- const size_t ncols = ne00;
207
- const size_t nrows = ne01;
208
- const sycl::half* src0_dq = (const sycl::half*)(src0_q + nrows * ncols / 2);
209
- stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
210
- [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
211
- k_get_rows_reorder<qk, qr, dq_reorder>(
212
- src0_dd, src0_dq, src1_dd, dst_dd, ne00, ne12, s1, s2,
213
- s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
214
- });
121
+ sycl_parallel_for(stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
122
+ k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12,
123
+ item_ct1);
124
+ });
215
125
 
216
126
  GGML_UNUSED(dst);
217
127
  GGML_UNUSED(ctx);
218
128
  }
219
129
 
220
-
221
130
  template <typename src0_t>
222
131
  static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
223
132
  const ggml_tensor *src1, ggml_tensor *dst,
@@ -245,9 +154,8 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens
245
154
  dpct::has_capability_or_fail(stream->get_device(),
246
155
  {sycl::aspect::fp16});
247
156
 
248
- stream->parallel_for(
249
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
250
- [=](sycl::nd_item<3> item_ct1) {
157
+ sycl_parallel_for(
158
+ stream, sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
251
159
  k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
252
160
  s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
253
161
  });
@@ -277,13 +185,8 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
277
185
  src1_i32, (float *)dst->data, ctx.stream());
278
186
  break;
279
187
  case GGML_TYPE_Q4_0:
280
- if (ctx.opt_feature.reorder && dst->op == GGML_OP_MUL_MAT) {
281
- get_rows_sycl_reorder<QK4_0, QR4_0, dequantize_q4_0_reorder>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
282
- src1_i32, (float *)dst->data, ctx.stream());
283
- } else {
284
- get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
285
- src1_i32, (float *)dst->data, ctx.stream());
286
- }
188
+ get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,
189
+ src1_i32, (float *)dst->data, ctx.stream());
287
190
  break;
288
191
  case GGML_TYPE_Q4_1:
289
192
  get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data,