@fugood/llama.node 0.0.1-alpha.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +18 -1
- package/lib/binding.ts +22 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/src/llama.cpp/tests/CMakeLists.txt

@@ -1,10 +1,40 @@
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_test source)
+function(llama_target_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)

@@ -35,41 +65,71 @@ function(llama_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()
 
-#
-
-
-
-
-
-llama_test(test-tokenizer-0
-llama_test(test-tokenizer-0
-
-llama_test(test-tokenizer-
-
-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-llama_test(test-tokenizer-
-
-
-
-
-
-
-#
-llama_test(test-
-
-llama_test(test-
-
-llama_test(test-
-llama_test(test-
-
-llama_test(test-
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
+
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
+# TODO: enable when fixed
+# https://github.com/ggerganov/llama.cpp/pull/7036
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
+
+# build test-tokenizer-1-bpe target once and add many tests
+add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+# TODO: disabled due to slowness
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
+
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
+
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-grad0.cpp)
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-backend-ops.cpp)
+
+llama_target_and_test(test-rope.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_target_and_test(test-autorelease.cpp LABEL "model")
+
+llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
 target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
 
 # dummy executable - not installed
package/src/llama.cpp/tests/test-backend-ops.cpp

@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix

@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                 size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                 if (t->type == GGML_TYPE_F16) {
                     tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                } else if (t->type == GGML_TYPE_BF16) {
+                    tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                 } else if (t->type == GGML_TYPE_F32) {
                     tv.push_back(*(float *) &buf[i]);
                 } else if (t->type == GGML_TYPE_I32) {

@@ -1090,6 +1092,12 @@ struct test_soft_max : public test_case {
         return VARS_TO_STR5(type, ne, mask, scale, max_bias);
     }
 
+    // the 1024 test with bias occasionally fails:
+    // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
+    virtual double max_nmse_err() override {
+        return 1e-6;
+    }
+
     test_soft_max(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 10},
             bool mask = false,

@@ -1101,7 +1109,7 @@ struct test_soft_max : public test_case {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_tensor * mask = nullptr;
         if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx,
+            mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
         }
         ggml_tensor * pos = nullptr;
         if (max_bias > 0.0f) {

@@ -1475,6 +1483,34 @@ struct test_leaky_relu : public test_case {
     }
 };
 
+// GGML_OP_FLASH_ATTN_EXT
+struct test_flash_attn_ext : public test_case {
+    const int64_t hs; // head size
+    const int64_t nh; // num heads
+    const int64_t kv; // kv size
+    const int64_t nb; // batch size
+
+    std::string vars() override {
+        return VARS_TO_STR4(hs, nh, kv, nb);
+    }
+
+    double max_nmse_err() override {
+        return 5e-4;
+    }
+
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
+        : hs(hs), nh(nh), kv(kv), nb(nb) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
+        ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        return out;
+    }
+};
+
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,

@@ -1661,7 +1697,7 @@ struct test_llama : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx,
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);

@@ -1783,7 +1819,7 @@ struct test_falcon : public test_llm {
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx,
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
 
         ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
         ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);

@@ -1864,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     std::default_random_engine rng(0);
 
     const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
         GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
         GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
         GGML_TYPE_Q8_0,

@@ -2095,7 +2131,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0,
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
                         test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
                     }
                 }

@@ -2139,6 +2175,20 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());
 
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    for (int hs : { 64, 128, }) { // other head sizes not implemented
+#else
+    for (int hs : { 64, 80, 128, 256, }) {
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        for (int nh : { 32, }) {
+            for (int kv : { 512, 1024, }) {
+                for (int nb : { 1, 2, 4, 8, }) {
+                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+                }
+            }
+        }
+    }
+
     // these tests are disabled to save execution time, but they can be handy for debugging
 #if 0
     test_cases.emplace_back(new test_llama(1));
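The new max_nmse_err overrides above relax or tighten the accuracy bound each op must meet when a backend's output is compared against the CPU reference. As a rough illustration only (the harness's exact metric lives in test-backend-ops.cpp; this standalone sketch assumes the common definition of squared error normalized by reference energy), the check behind a message like "NMSE = 0.000000103 > 0.000000100 FAIL" can be thought of as:

#include <cstdio>
#include <vector>

// Normalized mean squared error of a backend's output against a reference:
// sum of squared differences, divided by the total energy of the reference.
static double nmse(const std::vector<float> & ref, const std::vector<float> & out) {
    double err = 0.0;
    double nrm = 0.0;
    for (size_t i = 0; i < ref.size(); i++) {
        const double d = (double) out[i] - (double) ref[i];
        err += d * d;
        nrm += (double) ref[i] * (double) ref[i];
    }
    return err / nrm;
}

int main() {
    // Toy vectors standing in for the flattened op outputs the test compares.
    const std::vector<float> ref = {0.25f, -1.50f, 3.00f, 0.75f};
    const std::vector<float> out = {0.25f, -1.50f, 3.01f, 0.75f};

    const double e = nmse(ref, out);
    // A SOFT_MAX case with the 1e-6 override passes only if e stays below that bound;
    // test_flash_attn_ext uses the looser 5e-4 bound.
    printf("NMSE = %.9f -> %s\n", e, e < 1e-6 ? "PASS" : "FAIL");
    return 0;
}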
package/src/llama.cpp/tests/test-chat-template.cpp

@@ -49,6 +49,8 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
         // Llama-3
         "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
+        // Phi-3
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B

@@ -77,6 +79,8 @@ int main(void) {
         "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         // Llama 3
         "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
+        // Phi 3
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
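For context on what the new Phi-3 entries exercise: test-chat-template.cpp runs each template string through llama_chat_apply_template, which matches a fixed set of known chat templates rather than evaluating the Jinja itself, and compares the formatted result against the expected output above. A minimal standalone sketch of that call (assuming the llama_chat_apply_template signature exposed by this version's llama.h, with a null model so the explicit template string is used) might look like:

#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    // A short conversation mirroring the roles used in the test's expected outputs.
    llama_chat_message chat[] = {
        {"system", "You are a helpful assistant"},
        {"user",   "Hello"},
    };

    // The Phi-3 template string added to the test above.
    const char * tmpl =
        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + "
        "message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}"
        "{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}";

    // 1024 bytes is ample for this two-message chat.
    std::vector<char> buf(1024);

    // With a null model the explicit template string is used; add_ass=true appends the
    // assistant prompt, matching the trailing "<|assistant|>\n" in the expected output.
    int32_t res = llama_chat_apply_template(nullptr, tmpl, chat, 2, true, buf.data(), buf.size());
    if (res < 0) {
        fprintf(stderr, "template not recognized\n");
        return 1;
    }
    printf("%.*s\n", res, buf.data());
    return 0;
}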