@fugood/llama.node 0.3.15 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/package.json +1 -1
  18. package/src/llama.cpp/examples/server/server.cpp +5 -0
  19. package/src/llama.cpp/examples/tts/tts.cpp +8 -0
  20. package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +31 -27
  23. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +32 -12
  24. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +27 -1
  25. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
  26. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -6
  27. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +46 -12
  28. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +4 -2
  29. package/src/llama.cpp/src/llama-arch.cpp +1 -0
  30. package/src/llama.cpp/src/llama-model.cpp +65 -38
  31. package/src/llama.cpp/tests/test-backend-ops.cpp +57 -14
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
+    // parallelize by blocks
+    const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type);
+    const int dr = (nk + nth - 1) / nth;
+    const int k0 = dr * ith;
+    const int k1 = MIN(k0 + dr, nk);
 
-    if (ie0 < ie1) {
+    if (k0 < k1) {
         memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb0),
-            (ie1 - ie0) * nb0);
+            ((char *) dst->data + k0*nb0),
+            ((char *) src0->data + k0*nb0),
+            (k1 - k0) * nb0);
     }
 }
 
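Note on the hunk above: the copy is now split across threads by quantization block rather than by element, so a thread boundary can never fall in the middle of a multi-byte block. A minimal standalone sketch of the partitioning arithmetic (the numbers are hypothetical, not from the diff):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nk  = 100; // number of blocks (hypothetical)
        const int nth = 8;   // number of threads (hypothetical)
        const int dr  = (nk + nth - 1) / nth; // ceil(nk/nth) = 13 blocks per thread
        for (int ith = 0; ith < nth; ith++) {
            const int k0 = dr * ith;
            const int k1 = MIN(k0 + dr, nk);
            printf("thread %d copies blocks [%d, %d)\n", ith, k0, k1);
        }
        return 0; // thread 7 gets the short tail [91, 100)
    }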
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
 static void ggml_compute_forward_dup_bytes(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
-
     const struct ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     const size_t type_size = ggml_type_size(src0->type);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (src0->type == dst->type &&
-        ne00 == ne0 &&
+        ggml_are_same_shape(src0, dst) &&
         nb00 == type_size && nb0 == type_size) {
         // copy by rows
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00);
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 for (int64_t i01 = ir0; i01 < ir1; i01++) {
@@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     // dst counters
-
-    int64_t i10 = 0;
+    int64_t k10 = 0;
     int64_t i11 = 0;
     int64_t i12 = 0;
     int64_t i13 = 0;
 
+    // number of blocks in a row
+    const int64_t nk00 = ne00 / ggml_blck_size(src0->type);
+    const int64_t nk0 = ne0 / ggml_blck_size(dst->type);
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * ir0;
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
@@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes(
                 }
             }
             for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                    char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+                for (int64_t k00 = 0; k00 < nk00; k00++) {
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                    char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
                     memcpy(dst_ptr, src0_ptr, type_size);
 
-                    if (++i10 == ne0) {
-                        i10 = 0;
+                    if (++k10 == nk0) {
+                        k10 = 0;
                         if (++i11 == ne1) {
                             i11 = 0;
                             if (++i12 == ne2) {
@@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes(
                     }
                 }
             }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * (ne01 - ir1);
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
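Note on the dup_bytes changes above: for quantized tensors, nb0 is the byte size of one block rather than one element, so the destination counter has to advance in blocks as well. A hedged worked example using GGML_TYPE_Q8_0, whose block holds 32 elements in 34 bytes (32 int8 quants plus an fp16 scale):

    #include <stdio.h>

    int main(void) {
        const long ne00      = 4096; // elements per row (hypothetical)
        const long blck_size = 32;   // ggml_blck_size(GGML_TYPE_Q8_0)
        const long type_size = 34;   // ggml_type_size(GGML_TYPE_Q8_0), bytes per block
        const long nk00      = ne00 / blck_size; // blocks per row, as in the diff
        printf("%ld blocks per row, %ld bytes per row\n", nk00, nk00 * type_size);
        return 0; // prints: 128 blocks per row, 4352 bytes per row
    }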
@@ -14308,7 +14310,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
 
     // extra_buffer op?
-    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+    if (ggml_cpu_extra_compute_forward(params, tensor)) {
+        return;
+    }
 
     switch (tensor->op) {
         case GGML_OP_DUP:
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt

@@ -23,6 +23,38 @@ ggml_add_backend_library(ggml-sycl
                          ../../include/ggml-sycl.h
                         )
 
+find_package(DNNL)
+set(GGML_SYCL_DNNL 0)
+if(DNNL_FOUND)
+    if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
+        # Assuming oneDNN packaged with oneapi release is used which
+        # supports only intel target
+        set(DNNL_GPU_VENDOR "INTEL")
+        if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
+            message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+        endif()
+    endif()
+
+    # Verify oneDNN was compiled for the same target as llama
+    if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
+        target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+        set(GGML_SYCL_DNNL 1)
+        get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
+        foreach(CONFIG ${CONFIGS})
+            get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
+            message(STATUS "Found oneDNN: ${DNNL_LIB}")
+        endforeach()
+    else()
+        message(WARNING
+            "oneDNN must be compiled for the same target as llama.cpp.
+            llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
+            Disabling oneDNN support.")
+    endif()
+else()
+    message(STATUS "oneDNN not found, disabling oneDNN support")
+endif()
+target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
+
 if (GGML_SYCL_F16)
     if (GGML_SYCL_TARGET STREQUAL "AMD")
         message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
@@ -48,18 +80,6 @@ file(GLOB GGML_HEADERS_SYCL "*.hpp")
 file(GLOB GGML_SOURCES_SYCL "*.cpp")
 target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
 
-find_package(DNNL)
-message("-- DNNL found:" ${DNNL_FOUND})
-
-if (GGML_SYCL_TARGET STREQUAL "INTEL")
-    add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
-else()
-    add_compile_definitions(GGML_SYCL_DNNL=0)
-endif()
-
-if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-    target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
-endif()
 
 if (WIN32)
     find_package(IntelSYCL REQUIRED)
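With this change, oneDNN (DNNL) support is detected at configure time and enabled only when the library was built for the same GGML_SYCL_TARGET as llama.cpp, instead of being assumed for any INTEL target. A typical configure invocation for the Intel target might look like the following (illustrative; the icx/icpx compiler choice follows llama.cpp's SYCL build docs):

    cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=INTEL -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

If oneDNN is missing or was built for a different vendor, configuration now proceeds with GGML_SYCL_DNNL=0 and a status message or warning rather than failing.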
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp

@@ -170,7 +170,6 @@ static size_t g_scratch_offset = 0;
 int get_current_device_id();
 
 inline dpct::err0 ggml_sycl_set_device(const int device) try {
-
     int current_device_id;
     SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
 
@@ -242,6 +241,14 @@ struct ggml_sycl_pool_alloc {
         }
     }
 
+    T * realloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        if (ptr)
+            pool->free(ptr, actual_size);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
     // size is in number of elements
     T * alloc(size_t size) {
         GGML_ASSERT(pool != nullptr);
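Note that, unlike C's realloc, this helper frees the old buffer before allocating a new one, so existing contents are not preserved; that is acceptable here because the buffer is used as scratch memory that each primitive execution fully rewrites. A minimal standalone sketch of the same free-then-alloc semantics (names are illustrative, not from the diff):

    #include <cstdlib>
    #include <cstddef>

    struct scratch_buf {
        void  *ptr  = nullptr;
        size_t size = 0;
        // Mirrors ggml_sycl_pool_alloc::realloc: grow without copying old data.
        void *grow(size_t new_size) {
            if (ptr) free(ptr); // old contents are discarded, not copied
            ptr  = malloc(new_size);
            size = new_size;
            return ptr;
        }
        ~scratch_buf() { if (ptr) free(ptr); }
    };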
@@ -371,10 +378,29 @@ struct ggml_backend_sycl_context {
     dnnl::stream stream_dnnl() {
         return stream_dnnl(device, 0);
     }
+    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
+                                    const dnnl::engine & eng, const queue_ptr q) {
+        ggml_sycl_pool_alloc<uint8_t> * pool;
+        auto it = scratchpad_map.find(q);
+        if (it == scratchpad_map.end()) {
+            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
+            pool = scratchpad_map[q].get();
+        } else {
+            pool = it->second.get();
+        }
+
+        size_t scratchpad_size = scratchpad_md.get_size();
+        if (scratchpad_size > pool->actual_size) {
+            pool->realloc(scratchpad_size);
+        }
+        void * mem_ptr = pool->get();
+        return dnnl::memory(scratchpad_md, eng, mem_ptr);
+    }
 #endif
 
     // pool
     std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
 
     std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
 
package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp

@@ -13,9 +13,6 @@
 #ifndef GGML_SYCL_GEMM_HPP
 #define GGML_SYCL_GEMM_HPP
 
-#include <fstream>
-#include <iostream>
-
 #include "ggml-sycl.h"
 
 #if GGML_SYCL_DNNL
@@ -35,62 +32,34 @@ public:
         else static_assert(0);
     }
 
-    static inline void row_gemm(sycl::queue& q, bool a_trans,
-        bool b_trans, int m, int n, int k,
-        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
-    {
-        // Get the device associated with the queue
-        sycl::device dev = q.get_device();
-        // Get the context associated with the queue
-        sycl::context ctx = q.get_context();
-        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
-        const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
+    static inline void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
+                                const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
+        auto stream = ctx.stream_dnnl(q);
+        auto eng = ctx.engine_dnnl(q);
         dnnl::memory::dims a_dims = { m, k };
         dnnl::memory::dims b_dims = { k, n };
         dnnl::memory::dims c_dims = { m, n };
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
-        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
-        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
-        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
-        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
+        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
 
-        // Create the primitive.
-        auto matmul_prim = dnnl::matmul(matmul_pd);
-        // Primitive arguments.
-        std::unordered_map<int, dnnl::memory> matmul_args;
-        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
-        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
-        matmul_args.insert({ DNNL_ARG_DST, c_mem });
+        dnnl::primitive_attr primitive_attr;
+        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
 
-        matmul_prim.execute(stream, matmul_args);
-    }
-
-
-    static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
-        bool b_trans, int m, int n, int k,
-        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
-    {
-        auto const eng = stream.get_engine();
-        dnnl::memory::dims a_dims = { m, k };
-        dnnl::memory::dims b_dims = { k, n };
-        dnnl::memory::dims c_dims = { m, n };
-        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
-        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
-        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
         auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
         auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
-        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
+        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md, primitive_attr);
         auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
 
-        // Create the primitive.
+        auto scratchpad_md = matmul_pd.scratchpad_desc();
+        auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q);
         auto matmul_prim = dnnl::matmul(matmul_pd);
-        // Primitive arguments.
+
         std::unordered_map<int, dnnl::memory> matmul_args;
         matmul_args.insert({ DNNL_ARG_SRC, a_mem });
         matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
         matmul_args.insert({ DNNL_ARG_DST, c_mem });
+        matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem });
 
         matmul_prim.execute(stream, matmul_args);
     }
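The rewritten row_gemm folds the two previous overloads into one and switches the matmul primitive to dnnl::scratchpad_mode::user. In oneDNN, user scratchpad mode means the caller owns the temporary workspace: the primitive descriptor reports the required size via scratchpad_desc(), and the buffer is passed as DNNL_ARG_SCRATCHPAD at execution time. Combined with get_scratchpad_mem above, one pooled buffer per queue is grown on demand and reused across GEMM calls instead of oneDNN allocating a fresh scratchpad on every execution.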
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -2058,9 +2058,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
     const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
     to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
-    auto dnnl_stream = ctx.stream_dnnl(stream);
-    DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
-        src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
+    DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
+                              DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+                              dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
     const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
     to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
 #endif
@@ -2099,9 +2099,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         dst_dd_i, ldc)));
 # endif
 #else
-    auto dnnl_stream = ctx.stream_dnnl(stream);
-    DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
-        src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
+    DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
+                              DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
+                              dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
 #endif
     }
     GGML_UNUSED(dst);
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -149,6 +149,7 @@ class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
+static constexpr uint32_t p021_max_gqa_ratio = 8;
 
 enum vk_device_architecture {
     OTHER,
@@ -231,6 +232,7 @@ struct vk_device_struct {
     bool uma;
     bool prefer_host_memory;
     bool float_controls_rte_fp16;
+    bool subgroup_add;
 
     bool subgroup_size_control;
     uint32_t subgroup_min_size;
@@ -277,7 +279,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
     vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
 
-    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
+    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio];
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
     vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
     vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
@@ -2265,7 +2267,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
+        if (device->subgroup_add && device->subgroup_require_full_support) {
+            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true);
+        } else {
+            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
+        }
+    }
     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
@@ -2281,13 +2289,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    if (device->float_controls_rte_fp16) {
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    } else {
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    }
 
     ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
@@ -2471,13 +2487,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
     vk::PhysicalDeviceDriverProperties driver_props;
     vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
     vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
+    vk::PhysicalDeviceVulkan11Properties vk11_props;
     vk::PhysicalDeviceVulkan12Properties vk12_props;
     vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
 
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
     subgroup_props.pNext = &driver_props;
-    driver_props.pNext = &vk12_props;
+    driver_props.pNext = &vk11_props;
+    vk11_props.pNext = &vk12_props;
 
     VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props;
 
@@ -2541,6 +2559,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
     }
     device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
 
+    device->subgroup_add = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
+                           (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
+
     const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
 
     device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
@@ -4627,9 +4648,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t d_sz = sizeof(float) * d_ne;
 
+    // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02;
+    if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) {
+        gqa_ratio = 1;
+    }
+
     if (dryrun) {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
        return;
     }
 
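A hedged worked example of the gqa_ratio clamp above, with hypothetical head counts: 32 query heads sharing 8 KV heads give gqa_ratio = 4, and the dispatch further down then launches ne12 / 4 workgroups in z instead of ne12:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t ne12 = 32, ne02 = 8; // Q heads and KV heads (hypothetical)
        uint32_t gqa_ratio = ne12 / ne02;   // 4
        // Same guard as the diff: fall back to 1 Q row per workgroup when the
        // ratio is not an exact divisor or exceeds the 8 precompiled variants.
        if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) {
            gqa_ratio = 1;
        }
        std::printf("gqa_ratio=%u workgroups_z=%u\n", gqa_ratio, ne12 / gqa_ratio);
        return 0; // prints: gqa_ratio=4 workgroups_z=8
    }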
@@ -4653,8 +4680,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+
+    uint32_t workgroups_z = (uint32_t)ne12;
+    // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups
+    if (gqa_ratio > 1) {
+        workgroups_z /= gqa_ratio;
+    }
+
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z });
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -426,8 +426,9 @@ void process_shaders() {
         }
     }
 
-    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
 
     // Norms
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
@@ -445,6 +446,7 @@ void process_shaders() {
 
     for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
         string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+        string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
         string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }
 
package/src/llama.cpp/src/llama-arch.cpp

@@ -778,6 +778,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },