@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
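Most of the 0.3.0 → 0.3.2 delta is the vendored llama.cpp update: the old `grammar-parser` and `llama_sampling_context` interfaces are removed in favor of the new `common/arg.h` argument parser, the `common/log.h` macros, and the `gpt_sampler` API, as the `examples/infill/infill.cpp` diff below illustrates. What follows is a minimal sketch of the new sampler lifecycle, assembled only from calls that appear in that diff; it assumes you are compiling against the vendored llama.cpp common library, `LLAMA_EXAMPLE_COMMON` is assumed as the generic example id (the infill example itself passes `LLAMA_EXAMPLE_INFILL`), and none of this is part of `@fugood/llama.node`'s public JavaScript API.

```cpp
// Sketch: prompt ingestion + generation loop with the post-refactor common API.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }
    gpt_init(); // replaces the old LOG_DISABLE_LOGS / log_dump_cmdline() setup

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // the sampler object replaces the removed llama_sampling_context
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));

    // prompt tokens: accepted without grammar (repetition penalties only), then decoded
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    for (llama_token tok : embd_inp) {
        gpt_sampler_accept(smpl, tok, /*accept_grammar=*/false);
    }
    int n_past = 0;
    if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), (int) embd_inp.size(), n_past, 0))) {
        LOG_ERR("%s: failed to eval\n", __func__);
        return 1;
    }
    n_past += (int) embd_inp.size();

    // generation: sampled tokens are accepted with the grammar applied
    for (int i = 0; i < params.n_predict; ++i) {
        llama_token id = gpt_sampler_sample(smpl, ctx, -1);
        gpt_sampler_accept(smpl, id, /*accept_grammar=*/true);
        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", llama_token_to_piece(ctx, id).c_str());
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past, 0))) {
            LOG_ERR("%s: failed to eval\n", __func__);
            return 1;
        }
        n_past += 1;
    }

    LOG("\n");
    gpt_perf_print(ctx, smpl);

    gpt_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```

Prompt tokens are fed with `accept_grammar == false` so repetition penalties see them without constraining the grammar, while sampled tokens are accepted with the grammar applied; this mirrors the comments in the diff below.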
package/src/llama.cpp/examples/infill/infill.cpp

@@ -1,8 +1,9 @@
+#include "arg.h"
 #include "common.h"
-
 #include "console.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
-#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -34,6 +35,7 @@

 static llama_context ** g_ctx;
 static llama_model ** g_model;
+static gpt_sampler ** g_smpl;
 static gpt_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
@@ -54,7 +56,7 @@ static void write_logfile(

     const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
-
+        LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
             __func__, params.logdir.c_str());
         return;
     }
@@ -63,7 +65,7 @@ static void write_logfile(
     FILE * logfile = fopen(logfile_path.c_str(), "w");

     if (logfile == NULL) {
-
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
         return;
     }

@@ -81,7 +83,7 @@ static void write_logfile(
     yaml_dump_string_multiline(logfile, "output", output.c_str());
     yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

-
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }

@@ -92,9 +94,14 @@ static void sigint_handler(int signo) {
         is_interacting = true;
     } else {
         console::cleanup();
-
-
+        LOG("\n");
+        gpt_perf_print(*g_ctx, *g_smpl);
         write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+        // make sure all logs are flushed
+        LOG("Interrupted by user\n");
+        gpt_log_pause(gpt_log_main());
+
         _exit(130);
     }
 }
@@ -103,106 +110,95 @@ static void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
     gpt_params params;
-    llama_sampling_params & sparams = params.sparams;
     g_params = &params;

-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }

-
-
-
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
+    gpt_init();
+
+    auto & sparams = params.sparams;

     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });

     if (params.logits_all) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        LOG_ERR("************\n\n");

         return 0;
     }

     if (params.embedding) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        LOG_ERR("************\n\n");

         return 0;
     }

     if (params.n_ctx != 0 && params.n_ctx < 8) {
-
+        LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
+
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
-
-
-
+        LOG_ERR("\n************\n");
+        LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+        LOG_ERR("************\n\n");

         return 0;
     }

     if (params.rope_freq_base != 0.0) {
-
+        LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
     }

     if (params.rope_freq_scale != 0.0) {
-
+        LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
     }

-
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    LOG("%s: llama backend init\n", __func__);
+    LOG_INF("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_model * model = nullptr;
+    llama_context * ctx = nullptr;
+    gpt_sampler * smpl = nullptr;

     g_model = &model;
     g_ctx = &ctx;
+    g_smpl = &smpl;

     // load the model and apply lora adapter, if any
-
-
+    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    model = llama_init.model;
+    ctx = llama_init.context;

     if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }

     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
-
+    LOG_DBG("n_ctx: %d\n", n_ctx);

     if (n_ctx > n_ctx_train) {
-
-        __func__, n_ctx_train, n_ctx);
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }

     // print system information
     {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
-    const bool add_bos =
-    GGML_ASSERT(llama_add_eos_token(model)
-    LOG("add_bos: %d\n", add_bos);
+    const bool add_bos = llama_add_bos_token(model);
+    GGML_ASSERT(!llama_add_eos_token(model));

     std::vector<llama_token> embd_inp;
     std::vector<llama_token> embd_end;
@@ -227,18 +223,19 @@ int main(int argc, char ** argv) {
         embd_inp.push_back(middle_token);
     }

-
-
-
+    LOG_DBG("add_bos: %d\n", add_bos);
+    LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+    LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+    LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());

     // Should not run without any tokens
     if (embd_inp.empty()) {
         embd_inp.push_back(llama_token_bos(model));
-
+        LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
     }

     if ((int) embd_inp.size() > n_ctx - 4) {
-
+        LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }

@@ -247,9 +244,8 @@ int main(int argc, char ** argv) {
         params.n_keep = (int)embd_inp.size();
     }

-
-
-
+    LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+    LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());

     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -257,21 +253,21 @@ int main(int argc, char ** argv) {
     }

     if (params.verbose_prompt) {
-
-
-
+        LOG_INF("\n");
+        LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-
+            LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }

         if (params.n_keep > 0) {
-
+            LOG_INF("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-
+                LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
             }
-
+            LOG_CNT("'\n");
         }
-
+        LOG_INF("\n");
     }

     if (params.interactive) {
@@ -288,30 +284,30 @@ int main(int argc, char ** argv) {
         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

-
+        LOG_INF("%s: interactive mode on.\n", __func__);

         if (params.input_prefix_bos) {
-
+            LOG_INF("Input prefix with BOS\n");
         }

         if (!params.input_prefix.empty()) {
-
+            LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
         }

         if (!params.input_suffix.empty()) {
-
+            LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-
-
-
-
-
-
-
-
-
-
+    smpl = gpt_sampler_init(model, sparams);
+
+    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+    LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+    LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+    LOG_INF("\n");
+    LOG_INF("\n##### Infill mode #####\n\n");
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {
@@ -322,11 +318,11 @@ int main(int argc, char ** argv) {
                 " - To return control without starting a new line, end your input with '/'.\n"
                 " - If you want to submit another line, end your input with '\\'.\n";
         }
-
+        LOG_INF("== Running in interactive mode. ==\n");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-
+        LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
-
+        LOG_INF( "%s\n", control_message);

         is_interacting = params.interactive_first;
     }
@@ -346,8 +342,6 @@ int main(int argc, char ** argv) {

     std::vector<llama_token> embd;

-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-
     while (n_remain != 0 || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -361,9 +355,8 @@ int main(int argc, char ** argv) {
                 embd.resize(max_embd_size);

                 console::set_display(console::error);
-
+                LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                 console::set_display(console::reset);
-                fflush(stdout);
             }

             // infinite text generation via context swapping
@@ -372,14 +365,14 @@ int main(int argc, char ** argv) {
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
             if (n_past + (int) embd.size() > n_ctx) {
                 if (params.n_predict == -2) {
-
+                    LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                     break;
                 }

                 const int n_left = n_past - params.n_keep - 1;
                 const int n_discard = n_left/2;

-
+                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                     n_past, n_left, n_ctx, params.n_keep, n_discard);

                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@@ -387,9 +380,9 @@ int main(int argc, char ** argv) {

                 n_past -= n_discard;

-
+                LOG_DBG("after swap: n_past = %d\n", n_past);

-
+                LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

             }

@@ -401,16 +394,16 @@ int main(int argc, char ** argv) {
                     n_eval = params.n_batch;
                 }

-
+                LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

                 if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
-
+                    LOG_ERR("%s : failed to eval\n", __func__);
                     return 1;
                 }

                 n_past += n_eval;

-
+                LOG_DBG("n_past = %d\n", n_past);
             }

         }
@@ -418,11 +411,11 @@ int main(int argc, char ** argv) {
         embd.clear();

         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            const llama_token id =
+            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);

-
+            gpt_sampler_accept(smpl, id, true);

-
+            // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

             embd.push_back(id);

@@ -432,16 +425,16 @@ int main(int argc, char ** argv) {
             // decrement remaining sampling budget
             --n_remain;

-
+            LOG_DBG("n_remain: %d\n", n_remain);
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-
+            LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
             while ((int) embd_inp.size() > n_consumed) {
                 embd.push_back(embd_inp[n_consumed]);

                 // push the prompt in the sampling context in order to apply repetition penalties later
                 // for the prompt, we don't apply grammar rules
-
+                gpt_sampler_accept(smpl, embd_inp[n_consumed], false);

                 ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
@@ -454,7 +447,7 @@ int main(int argc, char ** argv) {
         if (input_echo) {
             for (auto id : embd) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
-
+                LOG("%s", token_str.c_str());

                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
@@ -463,7 +456,6 @@ int main(int argc, char ** argv) {
                     output_ss << token_str;
                 }
             }
-            fflush(stdout);
         }
         // reset color to default if we there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -473,13 +465,12 @@ int main(int argc, char ** argv) {
         // if not currently processing queued inputs;
         if ((int) embd_inp.size() <= n_consumed) {
             // deal with eot token in infill mode
-            if ((
+            if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
                 if (is_interacting && !params.interactive_first) {
                     // print an eot token
-
+                    LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
                 }
-
-                printf("\n");
+                LOG("\n");
                 console::set_display(console::user_input);
                 std::string buffer;
                 std::string line;
@@ -535,35 +526,33 @@ int main(int argc, char ** argv) {
                     n_remain = params.n_predict;
                     n_past = 0;
                     n_consumed = 0;
-                    // LOG_TEE("took new input\n");
                     is_interacting = false;
                 }
             // deal with end of generation tokens in interactive mode
-            else if (llama_token_is_eog(model,
-
+            else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+                LOG_DBG("found EOS token\n");

                 if (params.interactive) {

                     is_interacting = true;
-
+                    LOG("\n");
                     console::set_display(console::user_input);
-                    fflush(stdout);
                 }
             }

             if (n_past > 0 && is_interacting && !params.interactive) {
-
+                LOG_DBG("waiting for user input\n");

                 if (params.input_prefix_bos) {
-
+                    LOG_DBG("adding input prefix BOS token\n");
                     embd_inp.push_back(llama_token_bos(model));
                 }

                 std::string buffer;
                 if (!params.input_prefix.empty()) {
-
+                    LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                     buffer += params.input_prefix;
-
+                    LOG("%s", buffer.c_str());
                 }

                 std::string line;
@@ -581,17 +570,17 @@ int main(int argc, char ** argv) {
                 if (buffer.length() > 1) {
                     // append input suffix if any
                     if (!params.input_suffix.empty()) {
-
+                        LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                         buffer += params.input_suffix;
-
+                        LOG("%s", params.input_suffix.c_str());
                     }

-
+                    LOG_DBG("buffer: '%s'\n", buffer.c_str());

                     const size_t original_size = embd_inp.size();

                     const auto line_inp = ::llama_tokenize(ctx, buffer, false);
-
+                    LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

@@ -602,9 +591,9 @@ int main(int argc, char ** argv) {
                     }

                     n_remain -= line_inp.size();
-
+                    LOG_DBG("n_remain: %d\n", n_remain);
                 } else {
-
+                    LOG_DBG("empty line, passing control back\n");
                 }

                 input_echo = false; // do not echo this again
@@ -612,7 +601,7 @@ int main(int argc, char ** argv) {

         if (n_past > 0) {
             if (is_interacting) {
-
+                gpt_sampler_reset(smpl);
             }
             is_interacting = false;
         }
@@ -631,22 +620,18 @@ int main(int argc, char ** argv) {
         }
     }
     if (!params.interactive && n_remain <= 0) {
-
-        fflush(stdout);
+        LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
     }

-
+    LOG("\n");
+    gpt_perf_print(ctx, smpl);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

     llama_free(ctx);
     llama_free_model(model);

-
+    gpt_sampler_free(smpl);
     llama_backend_free();

-#ifndef LOG_DISABLE_LOGS
-    LOG_TEE("Log end\n");
-#endif // LOG_DISABLE_LOGS
-
     return 0;
 }
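The same diff also retires `LOG_TEE`, raw `printf`, and the `fflush(stdout)` calls in favor of the `common/log.h` macro family (`LOG`, `LOG_INF`, `LOG_WRN`, `LOG_ERR`, `LOG_DBG`, `LOG_CNT`) initialized by `gpt_init()`. A minimal sketch of that pattern, again using only calls visible in the diff above and assuming the vendored llama.cpp common library is linked:

```cpp
#include "common.h"
#include "log.h"

int main() {
    gpt_init(); // installs the common logger; the old LOG_DISABLE_LOGS blocks are gone

    LOG_INF("%s: llama backend init\n", __func__);                             // informational
    LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__); // warnings
    LOG_ERR("%s: unable to load model\n", __func__);                           // errors
    LOG_DBG("n_past = %d\n", 0);                                               // verbose/debug output
    LOG("raw output without a log prefix\n");                                  // token stream, prompts

    // before a hard exit (the new sigint_handler calls _exit(130) afterwards),
    // pausing the logger makes sure buffered lines are flushed:
    gpt_log_pause(gpt_log_main());
    return 0;
}
```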