@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
--- a/package/src/llama.cpp/examples/main/main.cpp
+++ b/package/src/llama.cpp/examples/main/main.cpp
@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "log.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cassert>
-#include <cinttypes>
-#include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -33,6 +33,7 @@
 
 static llama_context ** g_ctx;
 static llama_model ** g_model;
+static gpt_sampler ** g_smpl;
 static gpt_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
@@ -40,6 +41,15 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int argc, char ** argv) {
+    (void) argc;
+
+    LOG("\nexample usage:\n");
+    LOG("\n  text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    LOG("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    LOG("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
@@ -65,8 +75,7 @@ static void write_logfile(
 
     const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
-
-            __func__, params.logdir.c_str());
+        LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
         return;
     }
 
@@ -74,7 +83,7 @@ static void write_logfile(
     FILE * logfile = fopen(logfile_path.c_str(), "w");
 
     if (logfile == NULL) {
-
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
     }
 
@@ -92,7 +101,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
-
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
 
@@ -104,50 +113,38 @@ static void sigint_handler(int signo) {
             need_insert_eot = true;
         } else {
             console::cleanup();
-
-
+            LOG("\n");
+            gpt_perf_print(*g_ctx, *g_smpl);
             write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+            // make sure all logs are flushed
+            LOG("Interrupted by user\n");
+            gpt_log_pause(gpt_log_main());
+
             _exit(130);
         }
     }
 }
 #endif
 
-static
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
     llama_chat_msg new_msg{role, content};
-    auto formatted = llama_chat_format_single(
-        model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+    auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
     chat_msgs.push_back({role, content});
-
+    LOG_DBG("formatted: '%s'\n", formatted.c_str());
     return formatted;
 }
 
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
        return 1;
     }
 
-
+    gpt_init();
 
-
-    log_set_target(log_filename_generator("main", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
-
-    // TODO: Dump params ?
-    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+    auto & sparams = params.sparams;
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
@@ -155,120 +152,141 @@ int main(int argc, char ** argv) {
     atexit([]() { console::cleanup(); });
 
     if (params.logits_all) {
-
-
-
+        LOG_ERR("************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.embedding) {
-
-
-
+        LOG_ERR("************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");
 
         return 0;
     }
 
     if (params.n_ctx != 0 && params.n_ctx < 8) {
-
+        LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
 
     if (params.rope_freq_base != 0.0) {
-
+        LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 0.0) {
-
-    }
-
-    LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+        LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }
 
-
+    LOG_INF("%s: llama backend init\n", __func__);
 
-    std::mt19937 rng(params.seed);
-
-    LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    gpt_sampler * smpl = nullptr;
+
     std::vector<llama_chat_msg> chat_msgs;
+
     g_model = &model;
     g_ctx = &ctx;
+    g_smpl = &smpl;
 
     // load the model and apply lora adapter, if any
-
-
-
-
-
-    }
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;
 
     if (model == NULL) {
-
+        LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
+    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    set_process_priority(params.cpuparams.priority);
+
+    struct ggml_threadpool * threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch) {
+            LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            return 1;
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        return 1;
+    }
+
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-    LOG("n_ctx: %d\n", n_ctx);
 
     if (n_ctx > n_ctx_train) {
-
-            __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
     // print chat template example in conversation mode
     if (params.conversation) {
         if (params.enable_chat_template) {
-
+            LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
         } else {
-
+            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
         }
     }
 
     // print system information
     {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-
+        LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
         if (!file_exists(path_session)) {
-
+            LOG_INF("%s: session file does not exist, will create.\n", __func__);
         } else if (file_is_empty(path_session)) {
-
+            LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
         } else {
             // The file exists and is not empty
             session_tokens.resize(n_ctx);
             size_t n_token_count_out = 0;
             if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-
+                LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-
+            LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
         }
     }
 
-    const bool add_bos =
+    const bool add_bos = llama_add_bos_token(model);
     if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(llama_add_eos_token(model)
+        GGML_ASSERT(!llama_add_eos_token(model));
     }
-
+
+    LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
 
     std::vector<llama_token> embd_inp;
 
@@ -277,49 +295,31 @@ int main(int argc, char ** argv) {
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
-
+            LOG_DBG("tokenize the prompt\n");
             embd_inp = ::llama_tokenize(ctx, prompt, true, true);
         } else {
-
+            LOG_DBG("use session tokens\n");
            embd_inp = session_tokens;
         }
 
-
-
+        LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+        LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
     }
 
     // Should not run without any tokens
     if (embd_inp.empty()) {
         if (add_bos) {
             embd_inp.push_back(llama_token_bos(model));
-
+            LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
         } else {
-
+            LOG_ERR("input is empty\n");
             return -1;
         }
     }
 
     // Tokenize negative prompt
-    std::vector<llama_token> guidance_inp;
-    int guidance_offset = 0;
-    int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
-
     if ((int) embd_inp.size() > n_ctx - 4) {
-
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
 
@@ -333,29 +333,28 @@ int main(int argc, char ** argv) {
             n_matching_session_tokens++;
         }
         if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-
+            LOG_INF("%s: using full prompt from session file\n", __func__);
         } else if (n_matching_session_tokens >= embd_inp.size()) {
-
+            LOG_INF("%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-
-
+            LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
         } else {
-
-
+            LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+                    __func__, n_matching_session_tokens, embd_inp.size());
         }
 
         // remove any "future" tokens that we might have inherited from the previous session
         llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
     }
 
-
-
-        log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+            embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());
 
     // if we will use the cache for the full prompt without reaching the end of the cache, force
     // reevaluation of the last token to recalculate the cached logits
     if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
-
+        LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);
 
         session_tokens.resize(embd_inp.size() - 1);
     }
@@ -377,30 +376,20 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-
-
-        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-
-        }
-
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
         if (params.n_keep > add_bos) {
-
+            LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-
+                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-
+            LOG_CNT("'\n");
         }
-
+        LOG_INF("\n");
     }
 
     // ctrl+C handling
@@ -420,47 +409,56 @@ int main(int argc, char ** argv) {
     }
 
     if (params.interactive) {
-
+        LOG_INF("%s: interactive mode on.\n", __func__);
 
         if (!params.antiprompt.empty()) {
             for (const auto & antiprompt : params.antiprompt) {
-
+                LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
                     auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
                     for (int i = 0; i < (int) tmp.size(); i++) {
-
+                        LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                    }
                 }
             }
        }
 
        if (params.input_prefix_bos) {
-
+            LOG_INF("Input prefix with BOS\n");
        }
 
        if (!params.input_prefix.empty()) {
-
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }
 
        if (!params.input_suffix.empty()) {
-
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
            if (params.verbose_prompt) {
                auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
                for (int i = 0; i < (int) tmp.size(); i++) {
-
+                    LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
                }
            }
        }
    }
-
-
-
+
+    smpl = gpt_sampler_init(model, sparams);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+        return 1;
+    }
+
+    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
     // group-attention state
     // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -474,9 +472,9 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
         //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
         //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-
+        LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
     }
-
+    LOG_INF("\n");
 
     if (params.interactive) {
         const char * control_message;
@@ -488,11 +486,11 @@ int main(int argc, char ** argv) {
                               " - To return control without starting a new line, end your input with '/'.\n"
                               " - If you want to submit another line, end your input with '\\'.\n";
         }
-
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-
+        LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
-
+        LOG_INF( "%s\n", control_message);
 
         is_interacting = params.interactive_first;
     }
@@ -506,7 +504,6 @@ int main(int argc, char ** argv) {
     int n_remain = params.n_predict;
     int n_consumed = 0;
     int n_session_consumed = 0;
-    int n_past_guidance = 0;
 
     std::vector<int> input_tokens; g_input_tokens = &input_tokens;
     std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -518,7 +515,6 @@ int main(int argc, char ** argv) {
     display = params.display_prompt;
 
     std::vector<llama_token> embd;
-    std::vector<llama_token> embd_guidance;
 
     // tokenized antiprompts
     std::vector<std::vector<llama_token>> antiprompt_ids;
@@ -528,18 +524,12 @@ int main(int argc, char ** argv) {
         antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
     }
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
     if (llama_model_has_encoder(model)) {
         int enc_input_size = embd_inp.size();
         llama_token * enc_input_buf = embd_inp.data();
 
         if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
-
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
        }
 
@@ -565,9 +555,8 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);
 
                 console::set_display(console::error);
-
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }
 
             if (ga_n == 1) {
@@ -575,33 +564,35 @@ int main(int argc, char ** argv) {
                 // if we run out of context:
                 // - take the n_keep first tokens from the original prompt (via n_past)
                 // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-                if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
-                    if (params.n_predict == -2) {
-                        LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
-                        break;
-                    }
 
-
-
+                if (n_past + (int) embd.size() >= n_ctx) {
+                    if (!params.ctx_shift){
+                        LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+                        break;
+                    } else {
+                        if (params.n_predict == -2) {
+                            LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+                            break;
+                        }
 
-
-
+                        const int n_left = n_past - params.n_keep;
+                        const int n_discard = n_left/2;
 
-
-
+                        LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+                                n_past, n_left, n_ctx, params.n_keep, n_discard);
 
-
+                        llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+                        llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
 
-
-                        n_past_guidance -= n_discard;
-                    }
+                        n_past -= n_discard;
 
-
+                        LOG_DBG("after swap: n_past = %d\n", n_past);
 
-
+                        LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
-
-
+                        LOG_DBG("clear session path\n");
+                        path_session.clear();
+                    }
                 }
             } else {
                 // context extension via Self-Extend
@@ -610,10 +601,10 @@ int main(int argc, char ** argv) {
                 const int bd = (ga_w/ga_n)*(ga_n - 1);
                 const int dd = (ga_w/ga_n) - ib*bd - ga_w;
 
-
-
-
-
+                LOG_DBG("\n");
+                LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+                LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+                LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
 
                 llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
                 llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@@ -623,7 +614,7 @@ int main(int argc, char ** argv) {
 
                 ga_i += ga_w/ga_n;
 
-
+                LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
             }
         }
 
@@ -649,65 +640,25 @@ int main(int argc, char ** argv) {
             }
         }
 
-        // evaluate tokens in batches
-        // embd is typically prepared beforehand to fit within a batch, but not always
-        if (ctx_guidance) {
-            int input_size = 0;
-            llama_token * input_buf = NULL;
-
-            if (n_past_guidance < (int) guidance_inp.size()) {
-                // Guidance context should have the same data with these modifications:
-                //
-                // * Replace the initial prompt
-                // * Shift everything by guidance_offset
-                embd_guidance = guidance_inp;
-                if (embd.begin() + original_prompt_len < embd.end()) {
-                    embd_guidance.insert(
-                        embd_guidance.end(),
-                        embd.begin() + original_prompt_len,
-                        embd.end()
-                    );
-                }
-
-                input_buf = embd_guidance.data();
-                input_size = embd_guidance.size();
-
-                LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
-            } else {
-                input_buf = embd.data();
-                input_size = embd.size();
-            }
-
-            for (int i = 0; i < input_size; i += params.n_batch) {
-                int n_eval = std::min(input_size - i, params.n_batch);
-                if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
-                    LOG_TEE("%s : failed to eval\n", __func__);
-                    return 1;
-                }
-
-                n_past_guidance += n_eval;
-            }
-        }
-
         for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
             int n_eval = (int) embd.size() - i;
             if (n_eval > params.n_batch) {
                 n_eval = params.n_batch;
            }
 
-
+            LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-
+                LOG_ERR("%s : failed to eval\n", __func__);
                return 1;
            }
 
            n_past += n_eval;
 
-
+            LOG_DBG("n_past = %d\n", n_past);
            // Display total tokens alongside total time
            if (params.n_print > 0 && n_past % params.n_print == 0) {
-
+                LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
            }
        }
 
@@ -718,7 +669,6 @@ int main(int argc, char ** argv) {
         }
 
         embd.clear();
-        embd_guidance.clear();
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // optionally save the session on first sample (for faster prompt loading next time)
@@ -726,14 +676,14 @@ int main(int argc, char ** argv) {
                 need_to_save_session = false;
                 llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
 
-
+                LOG_DBG("saved session to %s\n", path_session.c_str());
             }
 
-            const llama_token id =
+            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
 
-
+            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
-
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
             embd.push_back(id);
 
@@ -743,16 +693,16 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;
 
-
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);
 
                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-
+                gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -767,7 +717,7 @@ int main(int argc, char ** argv) {
                 const std::string token_str = llama_token_to_piece(ctx, id, params.special);
 
                 // Console/Stream Output
-
+                LOG("%s", token_str.c_str());
 
                 // Record Displayed Tokens To Log
                 // Note: Generated tokens are created one by one hence this check
@@ -779,8 +729,6 @@ int main(int argc, char ** argv) {
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
-
-                fflush(stdout);
             }
         }
 
@@ -795,7 +743,7 @@ int main(int argc, char ** argv) {
         // check for reverse prompt in the last n_prev tokens
         if (!params.antiprompt.empty()) {
             const int n_prev = 32;
-            const std::string last_output =
+            const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
 
             is_antiprompt = false;
             // Check if each of the reverse prompts appears at the end of the output.
@@ -817,7 +765,7 @@ int main(int argc, char ** argv) {
             }
 
             // check for reverse prompt using special tokens
-            llama_token last_token =
+            llama_token last_token = gpt_sampler_last(smpl);
             for (std::vector<llama_token> ids : antiprompt_ids) {
                 if (ids.size() == 1 && last_token == ids[0]) {
                     if (params.interactive) {
@@ -829,13 +777,13 @@ int main(int argc, char ** argv) {
             }
 
             if (is_antiprompt) {
-
+                LOG_DBG("found antiprompt: %s\n", last_output.c_str());
             }
         }
 
         // deal with end of generation tokens in interactive mode
-        if (llama_token_is_eog(model,
-
+        if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+            LOG_DBG("found an EOG token\n");
 
             if (params.interactive) {
                 if (!params.antiprompt.empty()) {
@@ -849,32 +797,32 @@ int main(int argc, char ** argv) {
                     chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
                 }
                 is_interacting = true;
-
+                LOG("\n");
             }
         }
 
         // if current token is not EOG, we add it to current assistant message
         if (params.conversation) {
-            auto id =
+            const auto id = gpt_sampler_last(smpl);
             assistant_ss << llama_token_to_piece(ctx, id, false);
         }
 
         if (n_past > 0 && is_interacting) {
-
+            LOG_DBG("waiting for user input\n");
 
             if (params.conversation) {
-
+                LOG("\n> ");
             }
 
             if (params.input_prefix_bos) {
-
+                LOG_DBG("adding input prefix BOS token\n");
                 embd_inp.push_back(llama_token_bos(model));
            }
 
            std::string buffer;
            if (!params.input_prefix.empty() && !params.conversation) {
-
-
+                LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+                LOG("%s", params.input_prefix.c_str());
            }
 
            // color user input only
@@ -897,11 +845,11 @@ int main(int argc, char ** argv) {
             if (buffer.length() > 1) {
                 // append input suffix if any
                 if (!params.input_suffix.empty() && !params.conversation) {
-
-
+                    LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+                    LOG("%s", params.input_suffix.c_str());
                }
 
-
+                LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
                const size_t original_size = embd_inp.size();
 
@@ -918,7 +866,7 @@ int main(int argc, char ** argv) {
                 const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
-
+                LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
                 // if user stop generation mid-way, we must add EOT to finish model's last response
                 if (need_insert_eot && format_chat) {
@@ -941,9 +889,9 @@ int main(int argc, char ** argv) {
                 assistant_ss.str("");
 
                 n_remain -= line_inp.size();
-
+                LOG_DBG("n_remain: %d\n", n_remain);
             } else {
-
+                LOG_DBG("empty line, passing control back\n");
             }
 
             input_echo = false; // do not echo this again
@@ -951,7 +899,7 @@ int main(int argc, char ** argv) {
 
         if (n_past > 0) {
             if (is_interacting) {
-
+                gpt_sampler_reset(smpl);
             }
             is_interacting = false;
         }
@@ -959,7 +907,7 @@ int main(int argc, char ** argv) {
 
         // end of generation
         if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
-
+            LOG(" [end of text]\n");
             break;
        }
 
@@ -972,23 +920,23 @@ int main(int argc, char ** argv) {
     }
 
     if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
-
+        LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
 
-
+    LOG("\n\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-
+    gpt_sampler_free(smpl);
+
     llama_free(ctx);
     llama_free_model(model);
 
-    llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
-
-
-#endif // LOG_DISABLE_LOGS
+    ggml_threadpool_free(threadpool);
+    ggml_threadpool_free(threadpool_batch);
 
     return 0;
 }