@fugood/llama.node 0.3.16 → 0.3.17

This diff shows the content of publicly available package versions as released to their respective public registries, and is provided for informational purposes only.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt
@@ -23,6 +23,32 @@ ggml_add_backend_library(ggml-sycl
     ../../include/ggml-sycl.h
 )
 
+file(GLOB GGML_HEADERS_SYCL "*.hpp")
+file(GLOB GGML_SOURCES_SYCL "*.cpp")
+target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
+
+if (WIN32)
+    # To generate a Visual Studio solution, using Intel C++ Compiler for ggml-sycl is mandatory
+    if( ${CMAKE_GENERATOR} MATCHES "Visual Studio" AND NOT (${CMAKE_GENERATOR_TOOLSET} MATCHES "Intel C"))
+        set_target_properties(ggml-sycl PROPERTIES VS_PLATFORM_TOOLSET "Intel C++ Compiler 2025")
+        set(CMAKE_CXX_COMPILER "icx")
+        set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
+    endif()
+endif()
+
+find_package(IntelSYCL)
+if (IntelSYCL_FOUND)
+    # Use oneAPI CMake when possible
+    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
+else()
+    # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance
+    target_compile_options(ggml-sycl PRIVATE "-fsycl")
+    target_link_options(ggml-sycl PRIVATE "-fsycl")
+endif()
+
+target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
+
+# Link against oneDNN
 find_package(DNNL)
 set(GGML_SYCL_DNNL 0)
 if(DNNL_FOUND)
@@ -62,8 +88,6 @@ if (GGML_SYCL_F16)
     add_compile_definitions(GGML_SYCL_F16)
 endif()
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
-
 if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
     add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
 elseif (GGML_SYCL_TARGET STREQUAL "AMD")
@@ -76,34 +100,84 @@ else()
     add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
 endif()
 
-file(GLOB GGML_HEADERS_SYCL "*.hpp")
-file(GLOB GGML_SOURCES_SYCL "*.cpp")
-target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
-
+if (GGML_SYCL_GRAPH)
+    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
+endif()
 
-if (WIN32)
-    find_package(IntelSYCL REQUIRED)
+# Link against Intel oneMKL or oneMath
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
+    # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
+    # See https://github.com/uxlfoundation/oneMath/issues/654
     find_package(MKL REQUIRED)
-    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
+    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
 else()
-    if (GGML_SYCL_GRAPH)
-        add_compile_definitions(GGML_SYCL_GRAPH)
+    find_package(oneMath QUIET)
+    if (NOT oneMath_FOUND)
+        message(STATUS "oneMath not found: oneMath will be automatically downloaded")
+        # Use FetchContent to automatically pull and build oneMath
+        include(FetchContent)
+        set(BUILD_FUNCTIONAL_TESTS False)
+        set(BUILD_EXAMPLES False)
+        set(TARGET_DOMAINS blas)
+        if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+            set(ENABLE_MKLCPU_BACKEND False)
+            set(ENABLE_MKLGPU_BACKEND False)
+            set(ENABLE_CUBLAS_BACKEND True)
+        elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+            set(ENABLE_MKLCPU_BACKEND False)
+            set(ENABLE_MKLGPU_BACKEND False)
+            set(ENABLE_ROCBLAS_BACKEND True)
+            # Ensure setting a string variable here is not overriden by oneMath CACHE variables
+            cmake_policy(SET CMP0126 NEW)
+            # Setting the device architecture is only needed and useful for AMD devices in oneMath
+            set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE)
+        endif()
+        FetchContent_Declare(
+            ONEMATH
+            GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git
+            GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a
+        )
+        FetchContent_MakeAvailable(ONEMATH)
+        # Create alias to match with find_package targets name
+        function(onemath_alias target)
+            if (TARGET ${target}_obj)
+                # Silence verbose warnings from external libraries
+                target_compile_options(${target}_obj PRIVATE -w)
+            endif()
+            if (TARGET ${target})
+                add_library(ONEMATH::${target} ALIAS ${target})
+            endif()
+        endfunction()
+        onemath_alias(onemath)
+        onemath_alias(onemath_blas_mklcpu)
+        onemath_alias(onemath_blas_mklgpu)
+        onemath_alias(onemath_blas_cublas)
+        onemath_alias(onemath_blas_rocblas)
     endif()
-    if (GGML_SYCL_TARGET STREQUAL "INTEL")
-        target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-    elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
-        add_compile_definitions(GGML_SYCL_NVIDIA)
-        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas)
+
+    # Below oneMath compile-time dispatching is used for better performance
+    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
+        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas)
+        target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
+        target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA)
    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
        if (NOT GGML_SYCL_DEVICE_ARCH)
            message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
        endif()
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa")
-        target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl)
+        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
+        target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
+        target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
+    else()
+        # Fallback to oneMath runtime dispatcher
+        target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
+        target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
    endif()
+endif()
 
-if (GGML_SYCL_DEVICE_ARCH)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}")
-endif()
+if (GGML_SYCL_DEVICE_ARCH)
+    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
+    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
 endif()
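
For orientation: the compile definitions this file now sets per target (GGML_SYCL_NVIDIA, GGML_SYCL_AMD, GGML_SYCL_GENERIC, GGML_SYCL_USE_INTEL_ONEMKL) are what the C++ sources can branch on to pick a BLAS path at build time. A minimal sketch of that pattern, assuming nothing beyond the macro names in this diff (the function and strings below are illustrative, not code from llama.cpp):

// Hypothetical sketch: branching on the compile definitions added by this
// CMake change. Only the macro names come from the diff; the rest is made up.
#include <cstdio>

static const char * ggml_sycl_blas_backend_name() {
#if defined(GGML_SYCL_NVIDIA)
    return "oneMath + cuBLAS (compile-time dispatch, nvptx64-nvidia-cuda)";
#elif defined(GGML_SYCL_AMD)
    return "oneMath + rocBLAS (compile-time dispatch, amdgcn-amd-amdhsa)";
#elif defined(GGML_SYCL_USE_INTEL_ONEMKL)
    return "Intel oneMKL (MKL::MKL_SYCL::BLAS)";
#else
    return "oneMath runtime dispatcher (GGML_SYCL_GENERIC)";
#endif
}

int main() { printf("%s\n", ggml_sycl_blas_backend_name()); }

Compile-time dispatch avoids oneMath's runtime backend lookup, which is why the NVIDIA and AMD branches link the specific ONEMATH::onemath_blas_* targets while only the generic fallback links the runtime dispatcher.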
package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp
@@ -13,6 +13,7 @@
 #ifndef GGML_SYCL_BACKEND_HPP
 #define GGML_SYCL_BACKEND_HPP
 
+#include "binbcast.hpp"
 #include "concat.hpp"
 #include "common.hpp"
 #include "conv.hpp"
package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp
@@ -0,0 +1,350 @@
+#include "binbcast.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+
+#include "ggml.h"
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s00,*/ int s01, int s02, int s03,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 =  i3*s03 +  i2*s02 +  i1*s01;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  =  i3*s3  +  i2*s2  +  i1*s1;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_sycl {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00,
+                    const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11,
+                    const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2,
+                    const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03,
+                    const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0,
+                    const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous,
+                    const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) {
+        int nr0 = ne10 / ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne[] = {ne0, ne1, ne2, ne3};
+        int64_t cne0[] = {ne00, ne01, ne02, ne03};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb[] = {nb0, nb1, nb2, nb3};
+        size_t cnb0[] = {nb00, nb01, nb02, nb03};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) {
+            for (int i = 0; i < 4; i++) {
+                if (nr[i] != 1) {
+                    break;
+                }
+                if (i > 0) {
+                    collapse_nb(cnb, cne);
+                    collapse_nb(cnb0, cne0);
+                    collapse_nb(cnb1, cne1);
+                    collapse(cne);
+                    collapse(cne0);
+                    collapse(cne1);
+                }
+            }
+        }
+        {
+            int64_t ne0 = cne[0];
+            int64_t ne1 = cne[1];
+            int64_t ne2 = cne[2];
+            int64_t ne3 = cne[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb[0];
+            size_t nb1 = cnb[1];
+            size_t nb2 = cnb[2];
+            size_t nb3 = cnb[3];
+
+            size_t nb00 = cnb0[0];
+            size_t nb01 = cnb0[1];
+            size_t nb02 = cnb0[2];
+            size_t nb03 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            size_t s00 = nb00 / sizeof(src0_t);
+            size_t s01 = nb01 / sizeof(src0_t);
+            size_t s02 = nb02 / sizeof(src0_t);
+            size_t s03 = nb03 / sizeof(src0_t);
+
+            GGML_UNUSED(s00);
+
+            GGML_ASSERT(nb0 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb1 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb2 % sizeof(dst_t) == 0);
+            GGML_ASSERT(nb3 % sizeof(dst_t) == 0);
+
+            GGML_ASSERT(nb00 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb01 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb02 % sizeof(src0_t) == 0);
+            GGML_ASSERT(nb03 % sizeof(src0_t) == 0);
+
+            GGML_ASSERT(nb10 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb11 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb12 % sizeof(src1_t) == 0);
+            GGML_ASSERT(nb13 % sizeof(src1_t) == 0);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] = std::min<unsigned int>(
+                ne1, block_size / (unsigned int)block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(
+                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+                                   (unsigned int)block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02,
+                                s03, s11, s12, s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s01, s02, s03, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+    }
+};
+
+template <class op>
+inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                                   ggml_tensor * dst) {
+    dpct::queue_ptr main_stream = ctx.stream();
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10,
+             ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3,
+             ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01,
+             ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13,
+             nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst),
+             main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
+        op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02,
+             ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1,
+             nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03,
+             ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
+             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03,
+             ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2,
+             nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, dst->src[0], dst->src[1], dst);
+}
+
+inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, dst->src[0], dst->src[1], dst);
+}
+
+inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, dst->src[0], dst->src[1], dst);
+}
+
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, dst->src[0], dst->src[1], dst);
+}
+
+inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(ctx, dst, dst->src[0], dst);
+}
+
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_add(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_sub(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_mul(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_div(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_repeat(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
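
The two kernels above implement ggml-style broadcasting with plain modulo arithmetic: a dst row index i1 reads src1 row i1 % ne11, so any src1 dimension of extent 1 repeats across the full dst extent. k_bin_bcast keeps dim 0 in a grid-stride inner loop, while k_bin_bcast_unravel linearizes all four dims for the >65535-block fallback. A self-contained CPU sketch of the same index math (plain C++, no SYCL; variable names mirror the kernel, but this is an illustration, not part of the diff):

#include <cstdio>
#include <vector>

// CPU reference for the broadcast-add index math used by k_bin_bcast:
// src1 dimensions of extent 1 wrap via the modulo and repeat across dst.
int main() {
    const int ne0 = 4, ne1 = 3;    // dst/src0 shape (contiguous, row stride ne0)
    const int ne10 = 4, ne11 = 1;  // src1 shape: one row broadcast over ne1 rows

    std::vector<float> src0(ne0 * ne1), src1(ne10 * ne11), dst(ne0 * ne1);
    for (int i = 0; i < ne0 * ne1; ++i)   src0[i] = (float) i;
    for (int i = 0; i < ne10 * ne11; ++i) src1[i] = 100.0f * (float) (i + 1);

    for (int i1 = 0; i1 < ne1; ++i1) {
        const int i11 = i1 % ne11;           // broadcast: always row 0 here
        for (int i0 = 0; i0 < ne0; ++i0) {
            const int i10 = i0 % ne10;       // no broadcast along dim 0
            dst[i1 * ne0 + i0] = src0[i1 * ne0 + i0] + src1[i11 * ne10 + i10];
        }
    }

    for (int i1 = 0; i1 < ne1; ++i1) {
        for (int i0 = 0; i0 < ne0; ++i0) printf("%6.1f ", dst[i1 * ne0 + i0]);
        printf("\n");
    }
}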
package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp
@@ -0,0 +1,39 @@
+#ifndef GGML_SYCL_BINBCAST_HPP
+#define GGML_SYCL_BINBCAST_HPP
+#include "common.hpp"
+
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+
+#endif //GGML_SYCL_BINBCAST_HPP
+
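
These entry points are reached through ggml's ordinary graph execution rather than called directly. A minimal sketch of exercising a broadcast add through the public ggml API (CPU compute shown; on the SYCL backend the same graph routes GGML_OP_ADD to ggml_sycl_add — the calls below are standard ggml API written from memory, so treat exact headers and signatures as assumptions):

#include "ggml.h"
#include "ggml-cpu.h"  // assumed location of ggml_graph_compute_with_ctx

int main() {
    // small scratch context; fields: mem_size, mem_buffer, no_alloc
    struct ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
    struct ggml_context * ctx = ggml_init(params);

    // a: 4x3, b: 4x1 — b broadcasts across the second dimension
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 1);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    // ... fill a->data and b->data here, then compute:
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
}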
package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp
@@ -66,41 +66,6 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
     return sycl_down_blk_size;
 }
 
-void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                          const ggml_tensor *src1, ggml_tensor *dst,
-                          const ggml_sycl_op_flatten_t op) try {
-
-    const bool use_src1 = src1 != nullptr;
-    if(use_src1)
-        GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-
-    // dd = data device
-    float * src0_ddf = (float *) src0->data;
-    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
-    float *  dst_ddf = (float *) dst->data;
-
-    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> dst_f(ctx.pool());
-
-    ggml_sycl_set_device(ctx.device);
-    queue_ptr main_stream = ctx.stream();
-    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
-    //     ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
-
-    // do the computation
-    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    // print_ggml_tensor("tensor", dst);
-}
-catch (sycl::exception const &exc) {
-
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-
 void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> streams) {
     for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
         for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
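
This removal is the other half of the binbcast refactor: ggml_sycl_op_flatten pre-cast every operand to float* and forwarded to a type-erased callback, which could not support the F16/I32/I16 dispatch that ggml_sycl_op_bin_bcast now performs itself. Schematically (the types below are opaque stand-ins so the sketch compiles standalone; the old callback signature is paraphrased from the call site in the removed code):

// Illustration only: opaque stand-ins so this sketch compiles on its own.
struct ggml_backend_sycl_context;
struct ggml_tensor;
using queue_ptr = void *;  // stand-in; the real type is a SYCL queue pointer

// Old shape (removed in this diff): a type-erased callback receiving
// pre-cast float pointers from ggml_sycl_op_flatten.
using ggml_sycl_op_flatten_t = void (*)(ggml_backend_sycl_context & ctx,
                                        const ggml_tensor * src0,
                                        const ggml_tensor * src1,
                                        ggml_tensor * dst,
                                        const float * src0_dd,
                                        const float * src1_dd,
                                        float * dst_dd,
                                        const queue_ptr & main_stream);

// New shape (see binbcast.cpp above): ops take only ctx and dst, read their
// sources from dst->src[], and dispatch on the actual tensor types.
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

int main() {}  // nothing to run; this file only contrasts the two signatures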