@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2

package/src/llama.cpp/examples/infill/infill.cpp

@@ -50,9 +50,9 @@ static void write_logfile(
         return;
     }
 
-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
     fprintf(logfile, "binary: infill\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");
 
-
-
+    yaml_dump_string_multiline(logfile, "output", output.c_str());
+    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_TEE("\n");
-        LOG_TEE("%s\n",
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
    }
     const bool add_bos = llama_should_add_bos_token(model);
     GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
 
            if (params.escape) {
                //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
-
-
+                string_process_escapes(params.input_prefix);
+                string_process_escapes(params.input_suffix);
            }
            suff_rm_leading_spc = params.escape;
            if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {

package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -161,10 +161,17 @@ static const char * split_mode_str(llama_split_mode mode) {
     }
 }
 
+static std::string pair_str(const std::pair<int, int> & p) {
+    static char buf[32];
+    snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
+    return buf;
+}
+
 struct cmd_params {
     std::vector<std::string> model;
     std::vector<int> n_prompt;
     std::vector<int> n_gen;
+    std::vector<std::pair<int, int>> n_pg;
     std::vector<int> n_batch;
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
@@ -188,11 +195,12 @@ static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
     /* n_gen */ {128},
+    /* n_pg */ {},
     /* n_batch */ {2048},
     /* n_ubatch */ {512},
     /* type_k */ {GGML_TYPE_F16},
     /* type_v */ {GGML_TYPE_F16},
-    /* n_threads */ {
+    /* n_threads */ {cpu_get_num_math()},
     /* n_gpu_layers */ {99},
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu */ {0},
@@ -215,10 +223,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
     printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
     printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -pg <pp,tg> (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
     printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf(" -ub
-    printf(" -ctk
-    printf(" -ctv
+    printf(" -ub, --ubatch-size <n> (default: %s)\n", join(cmd_params_defaults.n_ubatch, ",").c_str());
+    printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+    printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
     printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@@ -304,6 +313,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-pg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], ',');
+            if (p.size() != 2) {
+                invalid_param = true;
+                break;
+            }
+            params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])});
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -493,6 +513,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.model.empty()) { params.model = cmd_params_defaults.model; }
     if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
     if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; }
     if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
     if (params.n_ubatch.empty()) { params.n_ubatch = cmd_params_defaults.n_ubatch; }
     if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
@@ -632,6 +653,31 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
            };
            instances.push_back(instance);
        }
+
+        for (const auto & n_pg : params.n_pg) {
+            if (n_pg.first == 0 && n_pg.second == 0) {
+                continue;
+            }
+            cmd_params_instance instance = {
+                /* .model = */ m,
+                /* .n_prompt = */ n_pg.first,
+                /* .n_gen = */ n_pg.second,
+                /* .n_batch = */ nb,
+                /* .n_ubatch = */ nub,
+                /* .type_k = */ tk,
+                /* .type_v = */ tv,
+                /* .n_threads = */ nt,
+                /* .n_gpu_layers = */ nl,
+                /* .split_mode = */ sm,
+                /* .main_gpu = */ mg,
+                /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
+                /* .tensor_split = */ ts,
+                /* .use_mmap = */ mmp,
+                /* .embeddings = */ embd,
+            };
+            instances.push_back(instance);
+        }
     }
 
     return instances;
@@ -965,6 +1011,9 @@ struct markdown_printer : public printer {
         if (field == "n_gpu_layers") {
             return 3;
         }
+        if (field == "test") {
+            return 13;
+        }
 
         int width = std::max((int)field.length(), 10);
 
@@ -1091,12 +1140,11 @@ struct markdown_printer : public printer {
                value = test::get_backend();
            } else if (field == "test") {
                if (t.n_prompt > 0 && t.n_gen == 0) {
-                    snprintf(buf, sizeof(buf), "pp
+                    snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
                } else if (t.n_gen > 0 && t.n_prompt == 0) {
-                    snprintf(buf, sizeof(buf), "tg
+                    snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
                } else {
-
-                    exit(1);
+                    snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
                }
                value = buf;
            } else if (field == "t/s") {
@@ -1297,6 +1345,7 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
 
         uint64_t t_start = get_time_ns();
+
         if (t.n_prompt > 0) {
             test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
         }
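
Note on the llama-bench changes above: the new -pg <pp,tg> flag is parsed into the n_pg pair list, each pair produces one extra benchmark instance that runs a prompt phase and a generation phase back to back, and the markdown printer labels that run ppN+tgN. As a rough usage sketch (the model path shown is just the tool's built-in default, nothing shipped in this package), an invocation might look like:

    llama-bench -m models/7B/ggml-model-q4_0.gguf -pg 512,128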

package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt (new file)

@@ -0,0 +1,55 @@
+
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+## Fetch latest llama.cpp from GitHub
+#include(FetchContent)
+#FetchContent_Declare(
+#    llama
+#    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#    GIT_TAG master
+#)
+#
+## Also provides "common"
+#FetchContent_MakeAvailable(llama)
+
+# llama.cpp CI uses the code from the current branch
+# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
+add_subdirectory(../../../../../../ build-llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)

package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt

@@ -1,4 +1,3 @@
-
 # For more information about using CMake with Android Studio, read the
 # documentation: https://d.android.com/studio/projects/add-native-code.html.
 # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@@ -36,15 +35,15 @@ FetchContent_MakeAvailable(llama)
 # for GameActivity/NativeActivity derived applications, the same library name must be
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
-
-
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
 
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this
 # build script, prebuilt third-party libraries, or Android system libraries.
 target_link_libraries(${CMAKE_PROJECT_NAME}
-
-
-
-
-
+    # List libraries link to the target library
+    llama
+    common
+    android
+    log)

package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
 
 extern "C"
 JNIEXPORT jlong JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
     llama_model_params model_params = llama_model_default_params();
 
     auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
     llama_free_model(reinterpret_cast<llama_model *>(model));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     auto model = reinterpret_cast<llama_model *>(jmodel);
 
     if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
     llama_free(reinterpret_cast<llama_context *>(context));
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
     llama_backend_free();
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
     llama_log_set(log_callback, NULL);
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
     llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
 
     // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
 
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
     llama_backend_init();
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
     return env->NewStringUTF(llama_print_system_info());
 }
 
 extern "C"
 JNIEXPORT jint JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
 
 extern "C"
 JNIEXPORT jstring JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
 
 extern "C"
 JNIEXPORT void JNICALL
-
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
     llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }
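
The llama-android.cpp hunks above only rename the exported JNI entry points from Java_com_example_llama_Llm_* to Java_android_llama_cpp_LLamaAndroid_*. Going by standard JNI name mangling (package dots become underscores, a literal underscore becomes "_1"), the new symbols bind to a Kotlin class along the lines of the sketch below; this class is inferred from the symbol names and the CMake project name and is not itself part of this diff:

    // Sketch only: Kotlin declarations implied by the renamed JNI symbols.
    package android.llama.cpp

    class LLamaAndroid {
        companion object {
            init {
                // "llama-android" is the shared-library name declared in the new CMakeLists.txt
                System.loadLibrary("llama-android")
            }
        }

        external fun load_model(filename: String): Long   // Java_android_llama_cpp_LLamaAndroid_load_1model
        external fun free_model(model: Long)               // Java_android_llama_cpp_LLamaAndroid_free_1model
        external fun new_context(model: Long): Long        // Java_android_llama_cpp_LLamaAndroid_new_1context
        external fun free_context(context: Long)           // Java_android_llama_cpp_LLamaAndroid_free_1context
        external fun backend_init()                        // Java_android_llama_cpp_LLamaAndroid_backend_1init
        external fun backend_free()                        // Java_android_llama_cpp_LLamaAndroid_backend_1free
        external fun system_info(): String                 // Java_android_llama_cpp_LLamaAndroid_system_1info
        external fun kv_cache_clear(context: Long)         // Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear
    }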

package/src/llama.cpp/examples/llava/clip.h

@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
-/** preprocess img and store the result in res_imgs, pad_to_square may be
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
 CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -189,6 +189,11 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     LOG_TEE("\n");
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
         const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
@@ -285,7 +290,7 @@ int main(int argc, char ** argv) {
 #endif // LOG_DISABLE_LOGS
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-
+        gpt_params_print_usage(argc, argv, params);
         show_additional_info(argc, argv);
         return 1;
     }
@@ -295,14 +300,10 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-
+    if (prompt_contains_image(params.prompt)) {
         auto ctx_llava = llava_init_context(&params, model);
 
-    auto image_embed = load_image(ctx_llava, &params,
-    if (!image_embed) {
-        std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
-        return 1;
-    }
+        auto image_embed = load_image(ctx_llava, &params, "");
 
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -311,7 +312,26 @@ int main(int argc, char ** argv) {
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
+    } else {
+        for (auto & image : params.image) {
+            auto ctx_llava = llava_init_context(&params, model);
+
+            auto image_embed = load_image(ctx_llava, &params, image);
+            if (!image_embed) {
+                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+                return 1;
+            }
+
+            // process the prompt
+            process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+            llama_print_timings(ctx_llava->ctx_llama);
+            llava_image_embed_free(image_embed);
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+        }
     }
+
     llama_free_model(model);
 
     return 0;

package/src/llama.cpp/examples/llava/llava.cpp

@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
 static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
     struct {
-        struct ggml_tensor * newline;
         struct ggml_context * ctx;
     } model;
 
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
-    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
-    if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
-        if (newline_tmp->buffer == NULL) {
-            LOG_TEE("newline_tmp tensor buffer is NULL\n");
-        }
-        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
-    } else {
-        model.newline->data = newline_tmp->data;
-        if (model.newline->data == NULL) {
-            LOG_TEE("newline_tmp tensor data is NULL\n");
-        }
-    }
-
     struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base

package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -121,7 +121,7 @@ int main(int argc, char ** argv){
         // debug
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         // print current draft sequence

package/src/llama.cpp/examples/main/main.cpp

@@ -60,9 +60,9 @@ static void write_logfile(
         return;
     }
 
-    const std::string timestamp =
+    const std::string timestamp = string_get_sortable_timestamp();
 
-    const bool success =
+    const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
         fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
     fprintf(logfile, "binary: main\n");
     char model_desc[128];
     llama_model_desc(model, model_desc, sizeof(model_desc));
-
+    yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
 
     fprintf(logfile, "\n");
     fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
     fprintf(logfile, "######################\n");
     fprintf(logfile, "\n");
 
-
-
+    yaml_dump_string_multiline(logfile, "output", output.c_str());
+    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
     llama_dump_timing_info_yaml(logfile, ctx);
     fclose(logfile);
@@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
-        params.prompt =
+        params.prompt = string_random_prompt(rng);
     }
 
     LOG("%s: llama backend init\n", __func__);
@@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_TEE("\n");
-        LOG_TEE("%s\n",
+        LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     std::string path_session = params.path_prompt_cache;
@@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
     LOG_TEE("\n\n");
 
     if (params.interactive) {
-        const char *control_message;
+        const char * control_message;
         if (params.multiline_input) {
-            control_message = " - To return control to
+            control_message = " - To return control to the AI, end your input with '\\'.\n"
                               " - To return control without starting a new line, end your input with '/'.\n";
         } else {
-            control_message = " - Press Return to return control to
+            control_message = " - Press Return to return control to the AI.\n"
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
@@ -523,6 +523,10 @@ int main(int argc, char ** argv) {
     }
 
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
+    if (!ctx_sampling) {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
 
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
@@ -703,7 +707,7 @@ int main(int argc, char ** argv) {
 
             const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
 
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
+            llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
 
             LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
 
@@ -724,7 +728,7 @@ int main(int argc, char ** argv) {
 
             // push the prompt in the sampling context in order to apply repetition penalties later
             // for the prompt, we don't apply grammar rules
-            llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+            llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
 
             ++n_consumed;
             if ((int) embd.size() >= params.n_batch) {
@@ -736,18 +740,26 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id,
-
+                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+
+                // Console/Stream Output
+                fprintf(stdout, "%s", token_str.c_str());
 
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                     input_tokens.push_back(id);
                 } else {
+                    // Outgoing Generated Tokens
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
+
+                fflush(stdout);
             }
-            fflush(stdout);
         }
+
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);
@@ -875,11 +887,11 @@ int main(int argc, char ** argv) {
                 embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
             }
             if (params.escape) {
-
+                string_process_escapes(buffer);
             }
 
             const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-            const auto line_inp = ::llama_tokenize(ctx, buffer, false,
+            const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
             const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
             LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());

package/src/llama.cpp/examples/parallel/parallel.cpp

@@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
     while (true) {
         if (dump_kv_cache) {
             llama_kv_cache_view_update(ctx, &kvc_view);
-
+            llama_kv_cache_dump_view_seqs(kvc_view, 40);
         }
 
         llama_batch_clear(batch);