@fugood/llama.node 0.3.16 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. package/CMakeLists.txt +6 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +44 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +374 -19
  24. package/src/LlamaCompletionWorker.h +31 -10
  25. package/src/LlamaContext.cpp +216 -7
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
  29. package/src/llama.cpp/.github/workflows/build.yml +89 -767
  30. package/src/llama.cpp/.github/workflows/docker.yml +9 -6
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +19 -23
  33. package/src/llama.cpp/CMakeLists.txt +11 -1
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +35 -4
  37. package/src/llama.cpp/common/arg.cpp +844 -121
  38. package/src/llama.cpp/common/arg.h +9 -0
  39. package/src/llama.cpp/common/chat.cpp +129 -107
  40. package/src/llama.cpp/common/chat.h +2 -0
  41. package/src/llama.cpp/common/common.cpp +64 -518
  42. package/src/llama.cpp/common/common.h +35 -45
  43. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  44. package/src/llama.cpp/common/llguidance.cpp +31 -47
  45. package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
  46. package/src/llama.cpp/common/minja/minja.hpp +186 -127
  47. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  48. package/src/llama.cpp/common/regex-partial.h +56 -0
  49. package/src/llama.cpp/common/sampling.cpp +60 -50
  50. package/src/llama.cpp/docs/build.md +122 -7
  51. package/src/llama.cpp/examples/CMakeLists.txt +2 -32
  52. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
  54. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  55. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  56. package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
  57. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  58. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  59. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  60. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  61. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  62. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  64. package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
  65. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  66. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  67. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  68. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  69. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  70. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  71. package/src/llama.cpp/ggml/include/ggml.h +76 -106
  72. package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
  73. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  74. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  75. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  76. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  77. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  78. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  79. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  80. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  81. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  82. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  83. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
  84. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  85. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  86. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  87. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
  89. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  90. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
  93. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
  94. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
  95. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
  96. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  101. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  102. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
  103. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  104. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  105. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  106. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  107. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  108. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  109. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
  110. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  111. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
  112. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  113. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
  115. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
  116. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
  117. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  120. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
  121. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  122. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  123. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  124. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  130. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  131. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  133. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  134. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  135. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  136. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  137. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  138. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  140. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  141. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  142. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
  143. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
  144. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
  145. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
  146. package/src/llama.cpp/ggml/src/ggml.c +170 -265
  147. package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
  148. package/src/llama.cpp/include/llama.h +82 -22
  149. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  150. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  151. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  152. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  153. package/src/llama.cpp/requirements/requirements-all.txt +5 -3
  154. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  155. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  156. package/src/llama.cpp/src/CMakeLists.txt +4 -2
  157. package/src/llama.cpp/src/llama-adapter.cpp +43 -1
  158. package/src/llama.cpp/src/llama-arch.cpp +163 -17
  159. package/src/llama.cpp/src/llama-arch.h +16 -0
  160. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  161. package/src/llama.cpp/src/llama-batch.h +2 -1
  162. package/src/llama.cpp/src/llama-chat.cpp +91 -16
  163. package/src/llama.cpp/src/llama-chat.h +7 -2
  164. package/src/llama.cpp/src/llama-context.cpp +479 -575
  165. package/src/llama.cpp/src/llama-context.h +44 -33
  166. package/src/llama.cpp/src/llama-cparams.h +1 -0
  167. package/src/llama.cpp/src/llama-graph.cpp +209 -157
  168. package/src/llama.cpp/src/llama-graph.h +38 -14
  169. package/src/llama.cpp/src/llama-hparams.h +13 -0
  170. package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
  171. package/src/llama.cpp/src/llama-kv-cache.h +283 -171
  172. package/src/llama.cpp/src/llama-memory.h +12 -2
  173. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  174. package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
  175. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  176. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  177. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  178. package/src/llama.cpp/src/llama-model.cpp +1803 -330
  179. package/src/llama.cpp/src/llama-model.h +21 -2
  180. package/src/llama.cpp/src/llama-quant.cpp +33 -10
  181. package/src/llama.cpp/src/llama-sampling.cpp +25 -7
  182. package/src/llama.cpp/src/llama-vocab.cpp +86 -10
  183. package/src/llama.cpp/src/llama-vocab.h +6 -0
  184. package/src/llama.cpp/src/llama.cpp +15 -1
  185. package/src/llama.cpp/tests/CMakeLists.txt +52 -31
  186. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  187. package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
  188. package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
  189. package/src/llama.cpp/tests/test-chat.cpp +15 -3
  190. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  191. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  192. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  193. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  194. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  195. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  196. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  197. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  198. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  199. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  200. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  201. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  202. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  203. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  204. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
  205. package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
  206. package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
  207. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  208. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
  209. package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
  210. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
  211. package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
  212. package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
  213. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  214. package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
  215. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
  216. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  217. package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
  218. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  219. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
  220. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
  221. package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
  222. package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
  223. package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
  224. package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
  225. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  226. package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
  227. package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
  228. package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
  229. package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
  230. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  231. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  232. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  233. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  234. package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
  235. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  236. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  237. package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
  238. package/src/llama.cpp/examples/llava/clip.h +0 -118
  239. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  240. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  241. package/src/llama.cpp/examples/llava/llava.cpp +0 -574
  242. package/src/llama.cpp/examples/llava/llava.h +0 -49
  243. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  244. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
  245. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  246. package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
  247. package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
  248. package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
  249. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  250. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  251. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  252. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  253. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  254. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  255. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  256. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  257. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  258. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
  259. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  260. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  261. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  262. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  263. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  264. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  265. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  266. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  267. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  268. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  269. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  270. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  271. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  272. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  273. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  274. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  275. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  276. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  277. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  278. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  279. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  280. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  281. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
@@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr);
42
42
 
43
43
  extern int g_ggml_sycl_debug;
44
44
  extern int g_ggml_sycl_disable_optimize;
45
+ extern int g_ggml_sycl_prioritize_dmmv;
45
46
 
46
47
  #define GGML_SYCL_DEBUG(...) \
47
48
  do { \
@@ -80,10 +81,6 @@ extern int g_ggml_sycl_disable_optimize;
80
81
  // max batch size to use MMQ kernels when tensor cores are available
81
82
  #define MMQ_MAX_BATCH_SIZE 32
82
83
 
83
- #if defined(_MSC_VER)
84
- #pragma warning(disable : 4244 4267) // possible loss of data
85
- #endif
86
-
87
84
  // dmmv = dequantize_mul_mat_vec
88
85
  #ifndef GGML_SYCL_DMMV_X
89
86
  #define GGML_SYCL_DMMV_X 32
@@ -118,17 +115,12 @@ static void crash() {
118
115
  GGML_ABORT("SYCL error");
119
116
  }
120
117
 
121
- #define SYCL_CHECK(err) \
122
- do { \
123
- auto err_ = (err); \
124
- if (err_ != 0) \
125
- ggml_sycl_error( \
126
- #err, \
127
- __func__, \
128
- __FILE__, \
129
- __LINE__, \
130
- "Meet error in this line code!"); \
131
- } while (0)
118
+ #define SYCL_CHECK(err) \
119
+ do { \
120
+ auto err_ = (err); \
121
+ if (err_ != 0) \
122
+ ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \
123
+ } while (0)
132
124
 
133
125
  #if DPCT_COMPAT_RT_VERSION >= 11100
134
126
  #define GGML_SYCL_ASSUME(x) __builtin_assume(x)
@@ -313,7 +305,6 @@ struct ggml_backend_sycl_context {
313
305
  int device;
314
306
  std::string name;
315
307
  optimize_feature opt_feature;
316
- bool optimized_graph=false;
317
308
 
318
309
  queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } };
319
310
 
@@ -494,298 +485,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor<Tp, dim> acc) {
494
485
 
495
486
  int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size);
496
487
 
497
- typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
498
- const ggml_tensor *src1,
499
- ggml_tensor *dst, const float *src0_dd,
500
- const float *src1_dd, float *dst_dd,
501
- const queue_ptr &main_stream);
502
-
503
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
504
- static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
505
- int ne0, int ne1, int ne2, int ne3,
506
- int ne10, int ne11, int ne12, int ne13,
507
- /*int s0, */ int s1, int s2, int s3,
508
- /*int s00,*/ int s01, int s02, int s03,
509
- /*int s10,*/ int s11, int s12, int s13,
510
- const sycl::nd_item<3> &item_ct1) {
511
- const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
512
- item_ct1.get_local_id(2);
513
- const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
514
- item_ct1.get_local_id(1));
515
- const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
516
- item_ct1.get_local_id(0)) /
517
- ne3;
518
- const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
519
- item_ct1.get_local_id(0)) %
520
- ne3;
521
-
522
- if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
523
- return;
524
- }
525
-
526
- const int i11 = i1 % ne11;
527
- const int i12 = i2 % ne12;
528
- const int i13 = i3 % ne13;
529
-
530
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
531
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
532
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
533
-
534
- const src0_t * src0_row = src0 + i_src0;
535
- const src1_t * src1_row = src1 + i_src1;
536
- dst_t * dst_row = dst + i_dst;
537
-
538
- for (int i0 = i0s; i0 < ne0;
539
- i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
540
- const int i10 = i0 % ne10;
541
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
542
- }
543
- }
544
-
545
- template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
546
- static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
547
- int ne0, int ne1, int ne2, int ne3,
548
- int ne10, int ne11, int ne12, int ne13,
549
- /*int s0, */ int s1, int s2, int s3,
550
- /*int s00,*/ int s01, int s02, int s03,
551
- /*int s10,*/ int s11, int s12, int s13,
552
- const sycl::nd_item<3> &item_ct1) {
553
-
554
- const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
555
- item_ct1.get_local_id(2);
556
-
557
- const int i3 = i/(ne2*ne1*ne0);
558
- const int i2 = (i/(ne1*ne0)) % ne2;
559
- const int i1 = (i/ne0) % ne1;
560
- const int i0 = i % ne0;
561
-
562
- if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
563
- return;
564
- }
565
-
566
- const int i11 = i1 % ne11;
567
- const int i12 = i2 % ne12;
568
- const int i13 = i3 % ne13;
569
-
570
- const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
571
- const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
572
- const size_t i_dst = i3*s3 + i2*s2 + i1*s1;
573
-
574
- const src0_t * src0_row = src0 + i_src0;
575
- const src1_t * src1_row = src1 + i_src1;
576
- dst_t * dst_row = dst + i_dst;
577
-
578
- const int i10 = i0 % ne10;
579
- dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
580
- }
581
-
582
-
583
- template<float (*bin_op)(const float, const float)>
584
- struct bin_bcast_sycl {
585
- template <typename src0_t, typename src1_t, typename dst_t>
586
- void operator()(ggml_backend_sycl_context & ctx,
587
- const struct ggml_tensor *src0,
588
- const struct ggml_tensor *src1, struct ggml_tensor *dst,
589
- const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
590
- queue_ptr stream) {
591
-
592
- GGML_TENSOR_BINARY_OP_LOCALS
593
-
594
- int nr0 = ne10/ne0;
595
- int nr1 = ne11/ne1;
596
- int nr2 = ne12/ne2;
597
- int nr3 = ne13/ne3;
598
-
599
- int nr[4] = { nr0, nr1, nr2, nr3 };
600
-
601
- // collapse dimensions until first broadcast dimension
602
- int64_t cne[] = {ne0, ne1, ne2, ne3};
603
- int64_t cne0[] = {ne00, ne01, ne02, ne03};
604
- int64_t cne1[] = {ne10, ne11, ne12, ne13};
605
- size_t cnb[] = {nb0, nb1, nb2, nb3};
606
- size_t cnb0[] = {nb00, nb01, nb02, nb03};
607
- size_t cnb1[] = {nb10, nb11, nb12, nb13};
608
- auto collapse = [](int64_t cne[]) {
609
- cne[0] *= cne[1];
610
- cne[1] = cne[2];
611
- cne[2] = cne[3];
612
- cne[3] = 1;
613
- };
614
-
615
- auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
616
- cnb[1] *= cne[1];
617
- cnb[2] *= cne[2];
618
- cnb[3] *= cne[3];
619
- };
620
-
621
- if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
622
- for (int i = 0; i < 4; i++) {
623
- if (nr[i] != 1) {
624
- break;
625
- }
626
- if (i > 0) {
627
- collapse_nb(cnb, cne);
628
- collapse_nb(cnb0, cne0);
629
- collapse_nb(cnb1, cne1);
630
- collapse(cne);
631
- collapse(cne0);
632
- collapse(cne1);
633
- }
634
- }
635
- }
636
- {
637
- int64_t ne0 = cne[0];
638
- int64_t ne1 = cne[1];
639
- int64_t ne2 = cne[2];
640
- int64_t ne3 = cne[3];
641
-
642
- int64_t ne10 = cne1[0];
643
- int64_t ne11 = cne1[1];
644
- int64_t ne12 = cne1[2];
645
- int64_t ne13 = cne1[3];
646
-
647
- size_t nb0 = cnb[0];
648
- size_t nb1 = cnb[1];
649
- size_t nb2 = cnb[2];
650
- size_t nb3 = cnb[3];
651
-
652
- size_t nb00 = cnb0[0];
653
- size_t nb01 = cnb0[1];
654
- size_t nb02 = cnb0[2];
655
- size_t nb03 = cnb0[3];
656
-
657
- size_t nb10 = cnb1[0];
658
- size_t nb11 = cnb1[1];
659
- size_t nb12 = cnb1[2];
660
- size_t nb13 = cnb1[3];
661
-
662
- size_t s0 = nb0 / sizeof(dst_t);
663
- size_t s1 = nb1 / sizeof(dst_t);
664
- size_t s2 = nb2 / sizeof(dst_t);
665
- size_t s3 = nb3 / sizeof(dst_t);
666
-
667
- size_t s10 = nb10 / sizeof(src1_t);
668
- size_t s11 = nb11 / sizeof(src1_t);
669
- size_t s12 = nb12 / sizeof(src1_t);
670
- size_t s13 = nb13 / sizeof(src1_t);
671
-
672
- size_t s00 = nb00 / sizeof(src0_t);
673
- size_t s01 = nb01 / sizeof(src0_t);
674
- size_t s02 = nb02 / sizeof(src0_t);
675
- size_t s03 = nb03 / sizeof(src0_t);
676
-
677
- GGML_UNUSED(s00);
678
-
679
- GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
680
- GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
681
- GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
682
- GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
683
-
684
- GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
685
- GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
686
- GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
687
- GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
688
-
689
- GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
690
- GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
691
- GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
692
- GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
693
-
694
- GGML_ASSERT(s0 == 1);
695
- GGML_ASSERT(s10 == 1);
696
-
697
- const int block_size = 128;
698
-
699
- int64_t hne0 = std::max(ne0/2LL, 1LL);
700
-
701
- sycl::range<3> block_dims(1, 1, 1);
702
- block_dims[2] = std::min<unsigned int>(hne0, block_size);
703
- block_dims[1] = std::min<unsigned int>(
704
- ne1, block_size / (unsigned int)block_dims[2]);
705
- block_dims[0] = std::min(
706
- std::min<unsigned int>(
707
- ne2 * ne3, block_size / (unsigned int)block_dims[2] /
708
- (unsigned int)block_dims[1]),
709
- 64U);
710
-
711
- sycl::range<3> block_nums(
712
- (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
713
- (ne1 + block_dims[1] - 1) / block_dims[1],
714
- (hne0 + block_dims[2] - 1) / block_dims[2]);
715
-
716
- if (block_nums[0] > 65535) {
717
- // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
718
- int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
719
- {
720
- dpct::has_capability_or_fail(stream->get_device(),
721
- {sycl::aspect::fp16});
722
-
723
- stream->parallel_for(
724
- sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
725
- sycl::range<3>(1, 1, block_size),
726
- sycl::range<3>(1, 1, block_size)),
727
- [=](sycl::nd_item<3> item_ct1) {
728
- k_bin_bcast_unravel<bin_op>(
729
- src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
730
- ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
731
- s03, s11, s12, s13, item_ct1);
732
- });
733
- }
734
- } else {
735
- /*
736
- DPCT1049:16: The work-group size passed to the SYCL kernel may
737
- exceed the limit. To get the device limit, query
738
- info::device::max_work_group_size. Adjust the work-group size if
739
- needed.
740
- */
741
- dpct::has_capability_or_fail(stream->get_device(),
742
- {sycl::aspect::fp16});
743
-
744
- stream->parallel_for(
745
- sycl::nd_range<3>(block_nums * block_dims, block_dims),
746
- [=](sycl::nd_item<3> item_ct1) {
747
- k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
748
- ne2, ne3, ne10, ne11, ne12, ne13,
749
- s1, s2, s3, s01, s02, s03, s11, s12, s13,
750
- item_ct1);
751
- });
752
- }
753
- }
754
- GGML_UNUSED(ctx);
755
- }
756
- };
757
-
758
- template <class op>
759
- inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
760
- const ggml_tensor *src1, ggml_tensor *dst,
761
- const float *src0_dd, const float *src1_dd,
762
- float *dst_dd,
763
- const queue_ptr &main_stream) {
764
-
765
- if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
766
- op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
767
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
768
- op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
769
- (sycl::half *)dst_dd, main_stream);
770
- } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
771
- op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
772
- main_stream);
773
- } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
774
- op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
775
- main_stream);
776
- } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
777
- op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
778
- main_stream);
779
- } else {
780
- fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
781
- ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
782
- GGML_ABORT("fatal error");
783
- }
488
+ constexpr size_t ceil_div(const size_t m, const size_t n) {
489
+ return (m + n - 1) / n;
784
490
  }
785
491
 
786
492
  bool gpu_has_xmx(sycl::device &dev);
787
-
788
- void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
789
- const ggml_tensor *src1, ggml_tensor *dst,
790
- const ggml_sycl_op_flatten_t op);
791
493
  #endif // GGML_SYCL_COMMON_HPP
@@ -183,6 +183,24 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k,
183
183
  }
184
184
  }
185
185
 
186
+ template <typename dst_t>
187
+ static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
188
+ const int64_t nb = k / QK_K;
189
+ const size_t local_size = 32;
190
+ const size_t global_size = nb * local_size;
191
+
192
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
193
+
194
+ stream->submit([&](sycl::handler & cgh) {
195
+ sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(12), cgh);
196
+
197
+ cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)),
198
+ [=](sycl::nd_item<1> item_ct1) {
199
+ dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
200
+ });
201
+ });
202
+ }
203
+
186
204
  template <typename dst_t>
187
205
  static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
188
206
  dpct::queue_ptr stream) {
@@ -437,41 +455,52 @@ static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k
437
455
  }
438
456
 
439
457
  template <typename src_t, typename dst_t>
440
- static void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k,
441
- const sycl::nd_item<3> &item_ct1) {
458
+ static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01,
459
+ const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03,
460
+ const sycl::nd_item<3> & item_ct1) {
461
+
442
462
  const int64_t work_group_size = item_ct1.get_local_range(2);
443
- const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
463
+ const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2);
464
+
465
+ const int64_t i01 = item_ct1.get_group(1);
466
+ const int64_t i02 = item_ct1.get_group(0) % ne02;
467
+ const int64_t i03 = item_ct1.get_group(0) / ne02;
444
468
 
445
469
  // make each work-item deal with more elements since sycl global range can not exceed max int
446
- const src_t * x = (const src_t *) vx;
447
- for (int64_t i = global_id; i < k; i += work_group_size * item_ct1.get_group_range(2)) {
448
- y[i] = x[i];
470
+ const src_t * x = static_cast<const src_t *>(vx);
471
+ const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01;
472
+ const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00;
473
+
474
+ #pragma unroll
475
+ for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) {
476
+ y[iy + i00] = static_cast<dst_t>(x[ix + i00]);
449
477
  }
450
478
  }
451
479
 
452
480
  template <typename src_t, typename dst_t>
453
- static void convert_unary_sycl(const void *__restrict__ vx,
454
- dst_t *__restrict__ y, const int64_t k,
455
- dpct::queue_ptr stream) {
456
- const int64_t num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
481
+ static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y,
482
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
483
+ const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) {
484
+ dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 });
485
+
486
+ sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE));
457
487
 
458
488
  // decrease global range when it exceeds the max int
459
- int64_t local_size = downsample_sycl_global_range(num_blocks, SYCL_DEQUANTIZE_BLOCK_SIZE);
460
- sycl::range<3> block_nums(1, 1, num_blocks);
461
- sycl::range<3> local_range(1, 1, local_size);
462
- {
463
- dpct::has_capability_or_fail(stream->get_device(),
464
- {sycl::aspect::fp16});
489
+ // TODO: Downsample logic is separated from the kernel, a rewrite is desirable
490
+ int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE);
491
+ sycl::range<3> workgroup_size(1, 1, downsized_workgroup);
465
492
 
466
- stream->parallel_for(
467
- sycl::nd_range<3>(block_nums * local_range, local_range),
468
- [=](sycl::nd_item<3> item_ct1) {
469
- convert_unary<src_t>(vx, y, k, item_ct1);
470
- });
471
- }
493
+ queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) {
494
+ convert_unary_nc<src_t>(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1);
495
+ });
472
496
  }
473
497
 
474
- to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
498
+ template <typename src_t, typename dst_t>
499
+ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) {
500
+ convert_unary_nc_sycl<src_t>(vx, y, k, 1, 1, 1, k, k, k, queue);
501
+ }
502
+
503
+ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
475
504
  switch (type) {
476
505
  case GGML_TYPE_Q4_0:
477
506
  if (dst->src[0]->extra &&
@@ -493,7 +522,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst) {
493
522
  case GGML_TYPE_Q3_K:
494
523
  return dequantize_row_q3_K_sycl;
495
524
  case GGML_TYPE_Q4_K:
496
- return dequantize_row_q4_K_sycl;
525
+ if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
526
+ return dequantize_row_q4_K_sycl_reorder;
527
+ } else {
528
+ return dequantize_row_q4_K_sycl;
529
+ }
497
530
  case GGML_TYPE_Q5_K:
498
531
  return dequantize_row_q5_K_sycl;
499
532
  case GGML_TYPE_Q6_K:
@@ -545,7 +578,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
545
578
  case GGML_TYPE_Q3_K:
546
579
  return dequantize_row_q3_K_sycl;
547
580
  case GGML_TYPE_Q4_K:
548
- return dequantize_row_q4_K_sycl;
581
+ if (dst->src[0]->extra &&
582
+ ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
583
+ return dequantize_row_q4_K_sycl_reorder;
584
+ } else {
585
+ return dequantize_row_q4_K_sycl;
586
+ }
549
587
  case GGML_TYPE_Q5_K:
550
588
  return dequantize_row_q5_K_sycl;
551
589
  case GGML_TYPE_Q6_K:
@@ -574,3 +612,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
574
612
  return nullptr;
575
613
  }
576
614
  }
615
+
616
+ to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) {
617
+ switch (type) {
618
+ case GGML_TYPE_F32:
619
+ return convert_unary_nc_sycl<float>;
620
+ default:
621
+ return nullptr;
622
+ }
623
+ }
@@ -1,6 +1,6 @@
1
1
  //
2
2
  // MIT license
3
- // Copyright (C) 2024 Intel Corporation
3
+ // Copyright (C) 2025 Intel Corporation
4
4
  // SPDX-License-Identifier: MIT
5
5
  //
6
6
 
@@ -16,12 +16,19 @@
16
16
  #include "common.hpp"
17
17
 
18
18
  template <typename T>
19
- using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
20
- int64_t k, dpct::queue_ptr stream);
21
- typedef to_t_sycl_t<float> to_fp32_sycl_t;
19
+ using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream);
20
+ typedef to_t_sycl_t<float> to_fp32_sycl_t;
22
21
  typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
23
22
 
24
- to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor *dst);
25
- to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst);
23
+ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst);
24
+ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst);
26
25
 
27
- #endif // GGML_SYCL_CONVERT_HPP
26
+ // Nc = Non-contiguous
27
+ template <typename T>
28
+ using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03,
29
+ int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue);
30
+
31
+ typedef to_t_nc_sycl_t<sycl::half> to_fp16_nc_sycl_t;
32
+ to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type);
33
+
34
+ #endif // GGML_SYCL_CONVERT_HPP
@@ -357,6 +357,28 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8
357
357
  }
358
358
  #endif
359
359
 
360
+ template <typename dst_t>
361
+ inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall,
362
+ const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) {
363
+ const int is = 2 * il;
364
+ constexpr int n = 4;
365
+
366
+ uint8_t sc, m;
367
+ get_scale_min_k4(is + 0, scales_local, sc, m);
368
+ const float d1 = dall * sc;
369
+ const float m1 = dmin * m;
370
+
371
+ get_scale_min_k4(is + 1, scales_local, sc, m);
372
+ const float d2 = dall * sc;
373
+ const float m2 = dmin * m;
374
+
375
+ sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(qs_ptr + 32 * il + n * ir);
376
+ for (int l = 0; l < n; ++l) {
377
+ y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
378
+ y[l + 32] = d2 * (q_vec[l] >> 4) - m2;
379
+ }
380
+ }
381
+
360
382
  template<typename dst_t>
361
383
  static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
362
384
  uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) {
@@ -365,36 +387,22 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
365
387
  const int64_t i = item_ct1.get_group(2);
366
388
 
367
389
  #if QK_K == 256
368
- // assume 32 threads
369
390
  const int64_t tid = item_ct1.get_local_id(2);
370
- const int64_t il = tid/8;
371
- const int64_t ir = tid%8;
372
- const int64_t is = 2*il;
373
- const int64_t n = 4;
391
+ const int64_t il = tid / 8;
392
+ const int64_t ir = tid % 8;
374
393
 
375
- dst_t * y = yy + i*QK_K + 64*il + n*ir;
394
+ dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
376
395
 
377
396
  const sycl::half2 dm = x[i].dm;
378
397
  const float dall = dm[0];
379
398
  const float dmin = dm[1];
380
399
 
381
- if (tid < 12)
400
+ if (tid < 12) {
382
401
  scales_local[tid] = x[i].scales[tid];
383
- item_ct1.barrier(sycl::access::fence_space::local_space);
384
-
385
- uint8_t sc, m;
386
- get_scale_min_k4(is + 0, scales_local, sc, m);
387
- const float d1 = dall * sc;
388
- const float m1 = dmin * m;
389
- get_scale_min_k4(is + 1, scales_local, sc, m);
390
- const float d2 = dall * sc;
391
- const float m2 = dmin * m;
392
-
393
- sycl::vec<uint8_t, n> q_vec = vec_aligned_load<uint8_t, n>(x[i].qs + 32*il + n*ir);
394
- for (int l = 0; l < n; ++l) {
395
- y[l + 0] = d1 * (q_vec[l] & 0xF) - m1;
396
- y[l +32] = d2 * (q_vec[l] >> 4) - m2;
397
402
  }
403
+
404
+ item_ct1.barrier(sycl::access::fence_space::local_space);
405
+ dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir);
398
406
  #else
399
407
  const int64_t tid = item_ct1.get_local_id(2);
400
408
  const uint8_t * q = x[i].qs;
@@ -406,6 +414,36 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
406
414
  #endif
407
415
  }
408
416
 
417
+ template <typename dst_t>
418
+ static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local,
419
+ const sycl::nd_item<1> & item_ct1, int64_t nb) {
420
+ const int64_t i = item_ct1.get_group(0); // block index
421
+ const int64_t tid = item_ct1.get_local_id(0); // thread index within block
422
+ const int64_t il = tid / 8;
423
+ const int64_t ir = tid % 8;
424
+
425
+ dst_t * y = yy + i * QK_K + 64 * il + 4 * ir;
426
+
427
+ const uint8_t * base = static_cast<const uint8_t *>(vx);
428
+ const size_t qs_offset = i * (QK_K / 2);
429
+ const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE;
430
+ const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2);
431
+
432
+ const uint8_t * qs_ptr = base + qs_offset;
433
+ const uint8_t * scales_ptr = base + scales_offset;
434
+ ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);
435
+
436
+ const float dall = dm_values.x();
437
+ const float dmin = dm_values.y();
438
+
439
+ if (tid < 12) {
440
+ scales_local[tid] = scales_ptr[tid];
441
+ }
442
+
443
+ item_ct1.barrier(sycl::access::fence_space::local_space);
444
+ dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir);
445
+ }
446
+
409
447
  template<typename dst_t>
410
448
  static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
411
449
  const sycl::nd_item<3> &item_ct1) {
@@ -1129,7 +1129,13 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
1129
1129
  dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1130
1130
  break;
1131
1131
  case GGML_TYPE_Q4_K:
1132
- dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1132
+ if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
1133
+ ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
1134
+ // reorder is currently not supported for dmmv
1135
+ GGML_ABORT("Unimplemented dequantize case case for q4_k reorder");
1136
+ } else {
1137
+ dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
1138
+ }
1133
1139
  break;
1134
1140
  case GGML_TYPE_Q5_K:
1135
1141
  dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);