@fugood/llama.node 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +14 -12
- package/src/llama.cpp/common/common.cpp +19 -5
- package/src/llama.cpp/common/common.h +2 -0
- package/src/llama.cpp/common/grammar-parser.cpp +9 -0
- package/src/llama.cpp/common/sampling.cpp +3 -3
- package/src/llama.cpp/common/sampling.h +1 -1
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
- package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
- package/src/llama.cpp/examples/main/main.cpp +5 -1
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
- package/src/llama.cpp/examples/server/server.cpp +12 -16
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/ggml-backend.c +2 -2
- package/src/llama.cpp/ggml-kompute.cpp +9 -3
- package/src/llama.cpp/ggml-quants.c +6 -0
- package/src/llama.cpp/ggml-rpc.cpp +1023 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +20 -143
- package/src/llama.cpp/ggml-vulkan.cpp +4 -2
- package/src/llama.cpp/ggml.c +116 -271
- package/src/llama.cpp/ggml.h +12 -15
- package/src/llama.cpp/llama.cpp +451 -265
- package/src/llama.cpp/llama.h +3 -0
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/CMakeLists.txt
CHANGED
@@ -64,6 +64,15 @@ if (VULKAN_SDK)
   find_package(Vulkan REQUIRED)
 endif()
 
+find_program(PATCH patch REQUIRED)
+
+add_custom_target(
+  patch ALL
+  COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
+  COMMENT "Applying patches"
+)
+
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")
 
package/README.md
CHANGED
@@ -30,7 +30,7 @@ const context = await loadModel({
 })
 
 // Do completion
-const { text
+const { text } = await context.completion(
   {
     prompt: 'This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.\n\nUser: Hello!\nLlama:',
     n_predict: 100,
Binary files changed (the prebuilt llama-node.node, node.lib, and default.metallib artifacts under package/bin/ listed above); binary contents are not shown in this diff.
package/lib/binding.ts
CHANGED
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -38,6 +38,7 @@
     ]
   },
   "files": [
+    "patches/*.patch",
    "bin/**/*",
    "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
    "lib/*.js",
package/patches/llama.patch
ADDED
@@ -0,0 +1,22 @@
+diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
+index b9449be0..cfa0f774 100644
+--- a/ggml-vulkan.cpp
++++ b/ggml-vulkan.cpp
+@@ -525,9 +525,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
+        vk::PipelineCreateFlags(),
+        pipeline_shader_create_info,
+        pipeline->layout);
+-    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+
+-    ctx->device->pipelines.push_back(pipeline);
++    try {
++        pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        ctx->device->pipelines.push_back(pipeline);
++    } catch (vk::UnknownError const&) {
++        std::cerr << "ggml_vk_create_pipeline: Failed to create pipeline " << name << std::endl;
++        ggml_vk_destroy_pipeline(ctx->device->device, pipeline);
++        pipeline.reset();
++    }
+ }
+
+ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
package/src/TokenizeWorker.cpp
CHANGED
@@ -7,7 +7,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
 
 void TokenizeWorker::Execute() {
   const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
-  _result
+  _result.tokens = std::move(tokens);
 }
 
 void TokenizeWorker::OnOK() {
package/src/llama.cpp/CMakeLists.txt
CHANGED
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -296,7 +297,7 @@ if (LLAMA_BLAS)
         if (LLAMA_STATIC)
             set(BLA_STATIC ON)
         endif()
-        if (
+        if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
             set(BLA_SIZEOF_INTEGER 8)
         endif()
 
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1281,17 +1294,6 @@ install(
     WORLD_READ
     WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
     install(
         FILES ggml-metal.metal
package/src/llama.cpp/common/common.cpp
CHANGED
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1056,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1367,14 +1379,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-
-
-
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
@@ -1422,6 +1432,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -h, --help show this help message and exit\n");
     printf(" --version show version and build info\n");
     printf(" -i, --interactive run in interactive mode\n");
+    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf(" --interactive-first run in interactive mode and wait for input right away\n");
     printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
@@ -1554,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
     printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
 }
+    printf(" --rpc SERVERS comma separated list of RPC servers\n");
     printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf(" -gan N, --grp-attn-n N\n");
@@ -1827,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -2652,6 +2665,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
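Note (not part of the diff): the new --rpc flag simply stores the raw comma-separated value in params.rpc_servers and passes it to llama_model_params::rpc_servers as a C string. As a minimal illustration only, a hypothetical split_rpc_servers helper below shows one way such a value could be broken into individual endpoints; the endpoint addresses are made up for the example.

    #include <sstream>
    #include <string>
    #include <vector>

    // Illustrative sketch, not llama.cpp code: split a --rpc value such as
    // "192.168.1.10:50052,192.168.1.11:50052" into separate endpoint strings.
    static std::vector<std::string> split_rpc_servers(const std::string & servers) {
        std::vector<std::string> endpoints;
        std::stringstream ss(servers);
        std::string endpoint;
        while (std::getline(ss, endpoint, ',')) {   // split on commas
            if (!endpoint.empty()) {
                endpoints.push_back(endpoint);
            }
        }
        return endpoints;
    }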
package/src/llama.cpp/common/common.h
CHANGED
@@ -82,6 +82,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -140,6 +141,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
package/src/llama.cpp/common/grammar-parser.cpp
CHANGED
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
 
                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
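Note (not part of the diff): all three added checks follow the same defensive pattern, stopping the scan of a NUL-terminated grammar string before parse_char can read past its end when a quoted literal or character class is never closed. A minimal standalone sketch of that pattern, using a hypothetical scan_quoted helper rather than llama.cpp's parse_char:

    #include <stdexcept>
    #include <string>

    // Illustrative sketch: walk a NUL-terminated buffer looking for a closing
    // quote, and throw instead of running past the terminator when it is missing.
    static std::string scan_quoted(const char * pos) {
        std::string out;
        while (*pos != '"') {
            if (!*pos) {                 // hit the terminating NUL before the closing quote
                throw std::runtime_error("unexpected end of input");
            }
            out.push_back(*pos++);       // stands in for parse_char(pos)
        }
        return out;
    }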
package/src/llama.cpp/common/sampling.cpp
CHANGED
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
-    result->
+    result->n_valid = 0;
 
     llama_sampling_set_rng_seed(result, params.seed);
 
@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->
+    ctx->n_valid = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
-    ctx_sampling->
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
 
     return id;
 }
package/src/llama.cpp/common/sampling.h
CHANGED
@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
-    size_t
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
     std::mt19937 rng;
 };
package/src/llama.cpp/examples/embedding/embedding.cpp
CHANGED
@@ -49,6 +49,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
+        //TODO: I would also add a parameter here to enable normalization or not.
+        /*fprintf(stdout, "unnormalized_embedding:");
+        for (int hh = 0; hh < n_embd; hh++) {
+            fprintf(stdout, "%9.6f ", embd[hh]);
+        }
+        fprintf(stdout, "\n");*/
         llama_embd_normalize(embd, out, n_embd);
     }
 }
@@ -123,10 +129,12 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    //
+    // check if the last token is SEP
+    // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-
+            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
+            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
package/src/llama.cpp/examples/llama-bench/llama-bench.cpp
CHANGED
@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -188,6 +195,7 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
+    /* n_pg */ {{512, 128}},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub
-    printf(" -ctk
-    printf(" -ctv
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
    if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
    if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
         };
         instances.push_back(instance);
     }
+
+    for (const auto & n_pg : params.n_pg) {
+        if (n_pg.first == 0 && n_pg.second == 0) {
+            continue;
+        }
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_pg.first,
+            /* .n_gen        = */ n_pg.second,
+            /* .n_batch      = */ nb,
+            /* .n_ubatch     = */ nub,
+            /* .type_k       = */ tk,
+            /* .type_v       = */ tv,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .split_mode   = */ sm,
+            /* .main_gpu     = */ mg,
+            /* .no_kv_offload= */ nkvo,
+            /* .flash_attn   = */ fa,
+            /* .tensor_split = */ ts,
+            /* .use_mmap     = */ mmp,
+            /* .embeddings   = */ embd,
+        };
+        instances.push_back(instance);
+    }
     }
 
     return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "test") {
+            return 13;
+        }
 
         int width = std::max((int)field.length(), 10);
 
@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
                 value = test::get_backend();
             } else if (field == "test") {
                 if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                 } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                 } else {
-
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                 }
                 value = buf;
             } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
 
         uint64_t t_start = get_time_ns();
+
         if (t.n_prompt > 0) {
             test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
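Note (not part of the diff): the new -pg option takes a "prompt,generation" pair such as 512,128, runs both phases in a single test, and the markdown printer labels the result pp512+tg128 instead of exiting. A small self-contained sketch of that parse-and-label flow, with a hypothetical parse_pg helper standing in for llama-bench's split and std::stoi handling:

    #include <cstdio>
    #include <string>
    #include <utility>

    // Illustrative sketch, not llama-bench code: turn a "pp,tg" argument into a
    // pair of counts and format the test label the way the markdown printer does.
    static std::pair<int, int> parse_pg(const std::string & arg) {
        const auto comma = arg.find(',');   // assumes a well-formed "pp,tg" value
        return { std::stoi(arg.substr(0, comma)), std::stoi(arg.substr(comma + 1)) };
    }

    int main() {
        const std::pair<int, int> pg = parse_pg("512,128");   // e.g. llama-bench -pg 512,128
        char buf[32];
        std::snprintf(buf, sizeof(buf), "pp%d+tg%d", pg.first, pg.second);
        std::printf("%s\n", buf);                             // prints: pp512+tg128
        return 0;
    }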
package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt
CHANGED
@@ -37,7 +37,7 @@ FetchContent_MakeAvailable(llama)
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
     # List C/C++ source files with relative paths to this CMakeLists.txt.
-
+    llama-android.cpp)
 
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
ADDED
@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+    llama
+    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+    GIT_TAG master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)