@fugood/llama.node 0.3.15 → 0.3.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/llama.cpp/examples/server/server.cpp +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +8 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +5 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +1493 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +31 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +32 -12
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +27 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +46 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +4 -2
- package/src/llama.cpp/src/llama-arch.cpp +1 -0
- package/src/llama.cpp/src/llama-model.cpp +65 -38
- package/src/llama.cpp/tests/test-backend-ops.cpp +57 -14

package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
+    // parallelize by blocks
+    const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type);
+    const int dr = (nk + nth - 1) / nth;
+    const int k0 = dr * ith;
+    const int k1 = MIN(k0 + dr, nk);
 
-    if (ie0 < ie1) {
+    if (k0 < k1) {
         memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb0),
-            (ie1 - ie0) * nb0);
+            ((char *) dst->data + k0*nb0),
+            ((char *) src0->data + k0*nb0),
+            (k1 - k0) * nb0);
     }
 }
 
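The rewritten ggml_compute_forward_dup_same_cont splits the copy into nk type-sized blocks (so quantized types count blocks, not elements) and hands each thread a contiguous ceil-divided slice; the MIN clamp plus the `k0 < k1` guard keep trailing threads in bounds when nk is not a multiple of nth. A minimal standalone sketch of that split, with hypothetical thread and block counts:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int nk  = 10;                   // total blocks to copy (hypothetical)
    const int nth = 4;                    // number of threads (hypothetical)
    const int dr  = (nk + nth - 1) / nth; // ceil(nk / nth) blocks per thread
    for (int ith = 0; ith < nth; ++ith) {
        const int k0 = dr * ith;
        const int k1 = std::min(k0 + dr, nk);
        if (k0 < k1) {
            std::printf("thread %d copies blocks [%d, %d)\n", ith, k0, k1);
        } else {
            std::printf("thread %d has no work\n", ith); // possible when nk < nth
        }
    }
    return 0;
}
```
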
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
 static void ggml_compute_forward_dup_bytes(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
-
     const struct ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
 
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     const size_t type_size = ggml_type_size(src0->type);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
 
@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (src0->type == dst->type &&
-        ne00 == ne0 &&
+        ggml_are_same_shape(src0, dst) &&
         nb00 == type_size && nb0 == type_size) {
         // copy by rows
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00);
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 for (int64_t i01 = ir0; i01 < ir1; i01++) {
 
@@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     // dst counters
-
-    int64_t i10 = 0;
+    int64_t k10 = 0;
     int64_t i11 = 0;
     int64_t i12 = 0;
     int64_t i13 = 0;
 
+    // number of blocks in a row
+    const int64_t nk00 = ne00 / ggml_blck_size(src0->type);
+    const int64_t nk0 = ne0 / ggml_blck_size(dst->type);
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * ir0;
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
 
@@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes(
                 }
             }
             for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                    char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+                for (int64_t k00 = 0; k00 < nk00; k00++) {
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                    char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
                     memcpy(dst_ptr, src0_ptr, type_size);
 
-                    if (++i10 == ne0) {
-                        i10 = 0;
+                    if (++k10 == nk0) {
+                        k10 = 0;
                         if (++i11 == ne1) {
                             i11 = 0;
                             if (++i12 == ne2) {
 
@@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes(
                     }
                 }
             }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * (ne01 - ir1);
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
 
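The dup_bytes hunks above make the same element-to-block switch for the strided path: the inner loop now steps k00 over nk00 = ne00 / ggml_blck_size(src0->type) blocks per row and memcpy's one type_size block per step, while the dst counter k10 wraps at nk0 blocks instead of ne0 elements. A hedged arithmetic check using ggml's published Q8_0 layout (32 elements per block, 34 bytes per block: a 2-byte fp16 scale plus 32 int8 quants):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00      = 4096; // elements per row (hypothetical)
    const int64_t blck_size = 32;   // ggml_blck_size(GGML_TYPE_Q8_0)
    const size_t  type_size = 34;   // ggml_type_size(GGML_TYPE_Q8_0): bytes per block

    const int64_t nk00     = ne00 / blck_size;          // 128 blocks per row
    const size_t  row_size = (size_t) nk00 * type_size; // 4352 bytes, i.e. ggml_row_size()

    std::printf("%lld blocks/row, %zu bytes/row\n", (long long) nk00, row_size);
    // Copying type_size bytes per k00 step moves whole blocks; iterating the
    // old way over ne00 elements would advance the pointers 32x too often.
    return 0;
}
```
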
@@ -14308,7 +14310,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
 
     // extra_buffer op?
-    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+    if (ggml_cpu_extra_compute_forward(params, tensor)) {
+        return;
+    }
 
     switch (tensor->op) {
         case GGML_OP_DUP:
 
package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt

@@ -23,6 +23,38 @@ ggml_add_backend_library(ggml-sycl
                          ../../include/ggml-sycl.h
 )
 
+find_package(DNNL)
+set(GGML_SYCL_DNNL 0)
+if(DNNL_FOUND)
+    if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
+        # Assuming oneDNN packaged with oneapi release is used which
+        # supports only intel target
+        set(DNNL_GPU_VENDOR "INTEL")
+        if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
+            message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
+        endif()
+    endif()
+
+    # Verify oneDNN was compiled for the same target as llama
+    if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
+        target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
+        set(GGML_SYCL_DNNL 1)
+        get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
+        foreach(CONFIG ${CONFIGS})
+            get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
+            message(STATUS "Found oneDNN: ${DNNL_LIB}")
+        endforeach()
+    else()
+        message(WARNING
+            "oneDNN must be compiled for the same target as llama.cpp.
+            llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
+            Disabling oneDNN support.")
+    endif()
+else()
+    message(STATUS "oneDNN not found, disabling oneDNN support")
+endif()
+target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
+
 if (GGML_SYCL_F16)
     if (GGML_SYCL_TARGET STREQUAL "AMD")
         message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
 
@@ -48,18 +80,6 @@ file(GLOB GGML_HEADERS_SYCL "*.hpp")
 file(GLOB GGML_SOURCES_SYCL "*.cpp")
 target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
 
-find_package(DNNL)
-message("-- DNNL found:" ${DNNL_FOUND})
-
-if (GGML_SYCL_TARGET STREQUAL "INTEL")
-    add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
-else()
-    add_compile_definitions(GGML_SYCL_DNNL=0)
-endif()
-
-if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
-    target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
-endif()
 
 if (WIN32)
     find_package(IntelSYCL REQUIRED)
 
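Net effect of the two CMakeLists.txt hunks: oneDNN detection moves ahead of the feature options, the oneDNN target vendor (DNNL_GPU_VENDOR) is validated against GGML_SYCL_TARGET, linking happens only on a match, and a single 0/1 GGML_SYCL_DNNL definition is now scoped to the ggml-sycl target via target_compile_definitions instead of leaking globally through add_compile_definitions. The sources consume it as a plain preprocessor gate, as gemm.hpp does below with `#if GGML_SYCL_DNNL`; a sketch of that consumption pattern (the function is illustrative, not from the package):

```cpp
// Illustrative only: consuming a 0/1 definition like GGML_SYCL_DNNL=1.
// The real gates live in the ggml-sycl sources.
#if GGML_SYCL_DNNL
static const char * gemm_backend_name() { return "oneDNN"; }
#else
static const char * gemm_backend_name() { return "native SYCL"; }
#endif
```
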
package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp

@@ -170,7 +170,6 @@ static size_t g_scratch_offset = 0;
 int get_current_device_id();
 
 inline dpct::err0 ggml_sycl_set_device(const int device) try {
-
     int current_device_id;
     SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
 
@@ -242,6 +241,14 @@ struct ggml_sycl_pool_alloc {
         }
     }
 
+    T * realloc(size_t size) {
+        GGML_ASSERT(pool != nullptr);
+        if (ptr)
+            pool->free(ptr, actual_size);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
     // size is in number of elements
     T * alloc(size_t size) {
         GGML_ASSERT(pool != nullptr);
 
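The new ggml_sycl_pool_alloc<T>::realloc has scratch-buffer semantics: it returns the old allocation to the pool and takes a fresh one, without copying the old contents (the callers below only grow the buffer and then overwrite it). A hedged heap-based analogue of the same contract:

```cpp
#include <cstddef>
#include <cstdlib>

// Analogue only, assuming plain malloc/free in place of the SYCL pool:
// grow-only scratch whose previous contents are intentionally NOT preserved.
struct scratch_alloc {
    void * ptr = nullptr;
    size_t actual_size = 0;

    void * realloc_bytes(size_t size) {
        std::free(ptr);          // release the old block (contents discarded)
        ptr = std::malloc(size); // take a new one of the requested size
        actual_size = ptr ? size : 0;
        return ptr;
    }
    ~scratch_alloc() { std::free(ptr); }
};
```
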
@@ -371,10 +378,29 @@ struct ggml_backend_sycl_context {
     dnnl::stream stream_dnnl() {
         return stream_dnnl(device, 0);
     }
+    dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
+                                    const dnnl::engine & eng, const queue_ptr q) {
+        ggml_sycl_pool_alloc<uint8_t> * pool;
+        auto it = scratchpad_map.find(q);
+        if (it == scratchpad_map.end()) {
+            scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
+            pool = scratchpad_map[q].get();
+        } else {
+            pool = it->second.get();
+        }
+
+        size_t scratchpad_size = scratchpad_md.get_size();
+        if (scratchpad_size > pool->actual_size) {
+            pool->realloc(scratchpad_size);
+        }
+        void * mem_ptr = pool->get();
+        return dnnl::memory(scratchpad_md, eng, mem_ptr);
+    }
 #endif
 
     // pool
     std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
+    std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
 
     std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
 
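get_scratchpad_mem keeps one lazily created, grow-only scratch buffer per SYCL queue: find-or-insert into scratchpad_map keyed by the queue pointer, then realloc only when the requested oneDNN scratchpad outgrows actual_size. A hedged sketch of that caching pattern, with std::vector standing in for the pool allocator:

```cpp
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

using queue_key = const void *; // stand-in for the sycl::queue * key

static std::unordered_map<queue_key, std::unique_ptr<std::vector<uint8_t>>> scratch_cache;

static uint8_t * get_scratch(queue_key q, size_t size) {
    auto it = scratch_cache.find(q);
    if (it == scratch_cache.end()) {
        it = scratch_cache.emplace(q, std::make_unique<std::vector<uint8_t>>()).first;
    }
    std::vector<uint8_t> & buf = *it->second;
    if (size > buf.size()) {
        buf.resize(size); // grow only; reused unchanged for smaller requests
    }
    return buf.data();
}
```
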
package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp

@@ -13,9 +13,6 @@
 #ifndef GGML_SYCL_GEMM_HPP
 #define GGML_SYCL_GEMM_HPP
 
-#include <fstream>
-#include <iostream>
-
 #include "ggml-sycl.h"
 
 #if GGML_SYCL_DNNL
 
@@ -35,62 +32,34 @@ public:
         else static_assert(0);
     }
 
-    static inline void row_gemm(sycl::queue& q, bool a_trans,
-        bool b_trans, int m, int n, int k,
-        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
-    {
-        // Get the device associated with the queue
-        sycl::device dev = q.get_device();
-        // Get the context associated with the queue
-        sycl::context ctx = q.get_context();
-        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
-        const dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
+    static inline void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
+                                const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
+        auto stream = ctx.stream_dnnl(q);
+        auto eng = ctx.engine_dnnl(q);
         dnnl::memory::dims a_dims = { m, k };
         dnnl::memory::dims b_dims = { k, n };
         dnnl::memory::dims c_dims = { m, n };
         const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
         const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
-        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
-        auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
-        auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
-        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
-        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
+        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
 
-        // Create the primitive.
-        auto matmul_prim = dnnl::matmul(matmul_pd);
-        // Primitive arguments.
-        std::unordered_map<int, dnnl::memory> matmul_args;
-        matmul_args.insert({ DNNL_ARG_SRC, a_mem });
-        matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
-        matmul_args.insert({ DNNL_ARG_DST, c_mem });
+        dnnl::primitive_attr primitive_attr;
+        primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);
 
-        matmul_prim.execute(stream, matmul_args);
-    }
-
-
-    static inline void row_gemm(const dnnl::stream& stream, bool a_trans,
-        bool b_trans, int m, int n, int k,
-        const void* a, dt at, const void* b, dt bt, void* c, dt ct)
-    {
-        auto const eng = stream.get_engine();
-        dnnl::memory::dims a_dims = { m, k };
-        dnnl::memory::dims b_dims = { k, n };
-        dnnl::memory::dims c_dims = { m, n };
-        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::ba : tag::ab);
-        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::ba : tag::ab);
-        const auto c_md = dnnl::memory::desc(c_dims, ct, tag::ab);
         auto a_mem = dnnl::memory(a_in_md, eng, const_cast<void*>(a));
         auto b_mem = dnnl::memory(b_in_md, eng, const_cast<void*>(b));
-        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md);
+        auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md, primitive_attr);
        auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c);
 
-
+        auto scratchpad_md = matmul_pd.scratchpad_desc();
+        auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q);
         auto matmul_prim = dnnl::matmul(matmul_pd);
-
+
         std::unordered_map<int, dnnl::memory> matmul_args;
         matmul_args.insert({ DNNL_ARG_SRC, a_mem });
         matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem });
         matmul_args.insert({ DNNL_ARG_DST, c_mem });
+        matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem });
 
         matmul_prim.execute(stream, matmul_args);
     }
 
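The rewritten row_gemm collapses the two overloads into one that reuses the context's cached oneDNN engine/stream and opts into oneDNN's user scratchpad mode: with dnnl::scratchpad_mode::user set on the primitive attributes, the library stops allocating temporaries per call and the caller must pass DNNL_ARG_SCRATCHPAD memory sized by matmul_pd.scratchpad_desc(). A hedged, minimal CPU-engine illustration of that oneDNN pattern (the diff applies it on the SYCL engine; requires linking oneDNN):

```cpp
#include <cstdint>
#include <unordered_map>
#include <vector>
#include "dnnl.hpp"

int main() {
    using tag = dnnl::memory::format_tag;
    using dt  = dnnl::memory::data_type;
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);

    const dnnl::memory::dim m = 4, n = 4, k = 4;
    auto a_md = dnnl::memory::desc({m, k}, dt::f32, tag::ab);
    auto b_md = dnnl::memory::desc({k, n}, dt::f32, tag::ab);
    auto c_md = dnnl::memory::desc({m, n}, dt::f32, tag::ab);

    dnnl::primitive_attr attr;
    attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); // caller owns temporaries

    auto pd = dnnl::matmul::primitive_desc(eng, a_md, b_md, c_md, attr);

    std::vector<float>   A(m * k, 1.f), B(k * n, 1.f), C(m * n, 0.f);
    std::vector<uint8_t> scratch(pd.scratchpad_desc().get_size()); // may be 0 bytes

    std::unordered_map<int, dnnl::memory> args = {
        { DNNL_ARG_SRC,        dnnl::memory(a_md, eng, A.data()) },
        { DNNL_ARG_WEIGHTS,    dnnl::memory(b_md, eng, B.data()) },
        { DNNL_ARG_DST,        dnnl::memory(c_md, eng, C.data()) },
        { DNNL_ARG_SCRATCHPAD, dnnl::memory(pd.scratchpad_desc(), eng, scratch.data()) },
    };
    dnnl::matmul(pd).execute(strm, args);
    strm.wait();
    return 0;
}
```
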
package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -2058,9 +2058,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
-        auto dnnl_stream = ctx.stream_dnnl(stream);
-        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
-            src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(), dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>());
+        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
+            DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
+            dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff* src1_ncols, stream);
 #endif
 
@@ -2099,9 +2099,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             dst_dd_i, ldc)));
 # endif
 #else
-        auto dnnl_stream = ctx.stream_dnnl(stream);
-        DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt<float>(),
-            src0_ddf_i, DnnlGemmWrapper::to_dt<float>(), dst_dd_i, DnnlGemmWrapper::to_dt<float>());
+        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
+            DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
+            dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
 #endif
     }
     GGML_UNUSED(dst);
 
package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -149,6 +149,7 @@ class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
+static constexpr uint32_t p021_max_gqa_ratio = 8;
 
 enum vk_device_architecture {
     OTHER,
 
@@ -231,6 +232,7 @@ struct vk_device_struct {
     bool uma;
     bool prefer_host_memory;
     bool float_controls_rte_fp16;
+    bool subgroup_add;
 
     bool subgroup_size_control;
     uint32_t subgroup_min_size;
 
@@ -277,7 +279,7 @@ struct vk_device_struct {
     vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
     vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT];
 
-    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
+    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32[p021_max_gqa_ratio];
     vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
     vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT];
     vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT];
 
@@ -2265,7 +2267,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
+        if (device->subgroup_add && device->subgroup_require_full_support) {
+            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_subgroup_add_len, mul_mat_vec_p021_f16_f32_subgroup_add_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true, true);
+        } else {
+            ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_p021_f16_f32[i], "mul_mat_vec_p021_f16_f32"+std::to_string(i+1), mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {device->subgroup_size, i + 1}, 1, true);
+        }
+    }
     ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
@@ -2281,13 +2289,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
-
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    if (device->float_controls_rte_fp16) {
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_rte_len, cpy_f32_q4_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_rte_len, cpy_f32_q4_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_rte_len, cpy_f32_q5_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_rte_len, cpy_f32_q5_1_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_rte_len, cpy_f32_q8_0_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_rte_len, cpy_f32_iq4_nl_rte_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    } else {
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
+    }
 
     ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
 
@@ -2471,13 +2487,15 @@ static vk_device ggml_vk_get_device(size_t idx) {
     vk::PhysicalDeviceDriverProperties driver_props;
     vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
     vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
+    vk::PhysicalDeviceVulkan11Properties vk11_props;
     vk::PhysicalDeviceVulkan12Properties vk12_props;
     vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
 
     props2.pNext = &props3;
     props3.pNext = &subgroup_props;
     subgroup_props.pNext = &driver_props;
-    driver_props.pNext = &vk12_props;
+    driver_props.pNext = &vk11_props;
+    vk11_props.pNext = &vk12_props;
 
     VkBaseOutStructure * last_struct = (VkBaseOutStructure *)&vk12_props;
 
@@ -2541,6 +2559,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
     }
     device->float_controls_rte_fp16 = vk12_props.shaderRoundingModeRTEFloat16;
 
+    device->subgroup_add = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
+                           (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);
+
     const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
 
     device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
@@ -4627,9 +4648,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t d_sz = sizeof(float) * d_ne;
 
+    // With grouped query attention there are > 1 Q matrices per K, V matrix.
+    uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02;
+    if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) {
+        gqa_ratio = 1;
+    }
+
     if (dryrun) {
         // Request descriptor sets
-        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
+        ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
         return;
     }
 
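gqa_ratio is the number of query heads per K/V head (ne12 / ne02). The guard falls back to 1 unless the ratio is an exact multiple in [1, 8], which matches the p021_max_gqa_ratio pipeline variants built above; each variant folds gqa_ratio rows into one dispatch. A small worked check of the guard (head counts are hypothetical):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the fallback logic added in ggml_vk_mul_mat_vec_p021_f16_f32.
static uint32_t pick_gqa_ratio(int64_t ne12, int64_t ne02) {
    uint32_t gqa_ratio = (uint32_t)ne12 / (uint32_t)ne02;
    if (gqa_ratio > 8 || gqa_ratio == 0 || ne12 != ne02 * gqa_ratio) {
        gqa_ratio = 1;
    }
    return gqa_ratio;
}

int main() {
    std::printf("%u\n", pick_gqa_ratio(32,  8)); // 4: 32 Q heads share 8 KV heads
    std::printf("%u\n", pick_gqa_ratio(32, 32)); // 1: standard multi-head attention
    std::printf("%u\n", pick_gqa_ratio(30,  8)); // 1: not an exact multiple
    std::printf("%u\n", pick_gqa_ratio(96,  8)); // 1: ratio 12 exceeds the 8 variants
    return 0;
}
```
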
@@ -4653,8 +4680,15 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
+
+    uint32_t workgroups_z = (uint32_t)ne12;
+    // When gqa_ratio > 1, each invocation does multiple rows and we can launch fewer workgroups
+    if (gqa_ratio > 1) {
+        workgroups_z /= gqa_ratio;
+    }
+
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, workgroups_z });
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
 
package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

@@ -426,8 +426,9 @@ void process_shaders() {
         }
     }
 
-    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
-    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
+    string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
+    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
 
     // Norms
     string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
@@ -445,6 +446,7 @@ void process_shaders() {
 
     for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
         string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
+        string_to_spv("cpy_f32_" + t + "_rte", "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}});
         string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
     }
 
package/src/llama.cpp/src/llama-arch.cpp

@@ -778,6 +778,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
         { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },