@fugood/llama.node 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp

@@ -87,7 +87,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
     auto path_to_model = env->GetStringUTFChars(filename, 0);
     LOGi("Loading model from %s", path_to_model);
 
-    auto model =
+    auto model = llama_model_load_from_file(path_to_model, model_params);
     env->ReleaseStringUTFChars(filename, path_to_model);
 
     if (!model) {
@@ -102,7 +102,7 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
-
+    llama_model_free(reinterpret_cast<llama_model *>(model));
 }
 
 extern "C"
@@ -305,7 +305,9 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+    //llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    delete batch;
 }
 
 extern "C"
@@ -345,6 +347,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
         jlong context_pointer,
         jlong batch_pointer,
         jstring jtext,
+        jboolean format_chat,
         jint n_len
     ) {
 
@@ -354,7 +357,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
 
-
+    bool parse_special = (format_chat == JNI_TRUE);
+    const auto tokens_list = common_tokenize(context, text, true, parse_special);
 
     auto n_ctx = llama_n_ctx(context);
     auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -366,7 +370,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
     }
 
     for (auto id : tokens_list) {
-        LOGi("%
+        LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
     }
 
     common_batch_clear(*batch);
@@ -403,6 +407,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
     const auto sampler = reinterpret_cast<llama_sampler *>(sampler_pointer);
     const auto model = llama_get_model(context);
+    const auto vocab = llama_model_get_vocab(model);
 
     if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
     if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
@@ -412,7 +417,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (
+    if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
         return nullptr;
     }
 
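The hunks above follow llama.cpp's model-lifecycle rename: loading goes through `llama_model_load_from_file`, teardown through `llama_model_free`, and contexts are created with `llama_init_from_model`. A minimal sketch of the new call sequence (the `model.gguf` path is a placeholder for illustration, not something shipped in this package):

    #include "llama.h"

    int main() {
        // hypothetical model path, for illustration only
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model == NULL) {
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx == NULL) {
            llama_model_free(model);
            return 1;
        }

        // ... tokenize / decode / sample here ...

        llama_free(ctx);
        llama_model_free(model);   // replaces the older llama_free_model
        return 0;
    }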
package/src/llama.cpp/examples/llava/clip.cpp

@@ -7,6 +7,7 @@
 #include "ggml-cpu.h"
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
+#include "gguf.h"
 
 //#ifdef GGML_USE_CUDA
 //#include "ggml-cuda.h"
@@ -262,7 +263,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
         {
             const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
             int arr_n = gguf_get_arr_n(ctx_gguf, i);
-            const void * data = gguf_get_arr_data(ctx_gguf, i);
+            const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
             std::stringstream ss;
             ss << "[";
             for (int j = 0; j < arr_n; j++) {
@@ -2734,7 +2735,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         total_size_org += orig_size;
         total_size_new += new_size;
         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-
+        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
         for (size_t j = 0; j < pad; ++j) {
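The gguf_kv_to_str change above comes with the GGUF reader being split out into its own gguf.h/gguf.cpp: `gguf_get_arr_data` is only meaningful for non-string array types, while string arrays are read element-wise with `gguf_get_arr_str`. A hedged sketch of that distinction (the helper name is made up; it assumes `ctx_gguf` came from `gguf_init_from_file` and `i` is a valid key index):

    #include "gguf.h"
    #include <cstdio>

    // Print an array-typed GGUF key/value, handling string arrays separately.
    static void print_arr_kv(const struct gguf_context * ctx_gguf, int i) {
        const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
        const int arr_n = (int) gguf_get_arr_n(ctx_gguf, i);

        if (arr_type == GGUF_TYPE_STRING) {
            // strings must go through the per-element accessor
            for (int j = 0; j < arr_n; j++) {
                printf("%s\n", gguf_get_arr_str(ctx_gguf, i, j));
            }
        } else {
            // the raw buffer is only valid for plain scalar element types
            const void * data = gguf_get_arr_data(ctx_gguf, i);
            printf("array of %d elements at %p\n", arr_n, data);
        }
    }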
package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -47,8 +47,12 @@ static const char * sample(struct common_sampler * smpl,
                            int * n_past) {
     const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
     common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     static std::string ret;
-    if (
+    if (llama_vocab_is_eog(vocab, id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -221,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -239,11 +243,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
 
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
-
     llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama =
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -265,7 +268,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }
 
     llama_free(ctx_llava->ctx_llama);
-
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }
 
@@ -323,7 +326,7 @@ int main(int argc, char ** argv) {
         }
     }
 
-
+    llama_model_free(model);
 
     return 0;
 }
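A pattern that repeats across the llava examples: the end-of-generation check is now done against the vocab handle obtained via `llama_model_get_vocab`, using `llama_vocab_is_eog`, instead of a model-level query. A rough sketch of that loop shape (the function name is hypothetical; `ctx` and `smpl` are assumed to be created elsewhere, and the decode step is omitted):

    #include "common.h"
    #include "sampling.h"
    #include "llama.h"
    #include <cstdio>

    // Sample tokens until an end-of-generation token is produced.
    static void generate_until_eog(llama_context * ctx, common_sampler * smpl) {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);

        while (true) {
            const llama_token id = common_sampler_sample(smpl, ctx, -1);
            common_sampler_accept(smpl, id, true);

            if (llama_vocab_is_eog(vocab, id)) {
                break; // EOS/EOT or any other end-of-generation token
            }

            printf("%s", common_token_to_piece(ctx, id).c_str());
            fflush(stdout);
            // a real loop would llama_decode() the new token before sampling again
        }
    }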
package/src/llama.cpp/examples/llava/llava.cpp

@@ -384,7 +384,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
 bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
     // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd =
+    int n_llama_embd = llama_model_n_embd(llama_get_model(ctx_llama));
     auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
     if (n_image_embd != n_llama_embd) {
         LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
@@ -456,7 +456,7 @@ struct llava_embd_batch {
 };
 
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-    int n_embd =
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
 
     for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
         int n_eval = image_embed->n_image_pos - i;
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp

@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -54,7 +54,7 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
         ctx_params.n_ctx = params->n_ctx;
     }
 
-    llama_context * ctx_llama =
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -75,7 +75,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }
 
     llama_free(ctx_llava->ctx_llama);
-
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }
 
@@ -167,8 +167,12 @@ static const char * sample(struct common_sampler * smpl,
                            int * n_past) {
     const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
     common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     static std::string ret;
-    if (
+    if (llama_vocab_is_eog(vocab, id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp

@@ -27,7 +27,7 @@
 
 static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
                                      int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
-    int n_embd =
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
     const int patch_size = 14 * 2;
     const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0);
     const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0);
@@ -132,8 +132,12 @@ static const char * sample(struct common_sampler * smpl,
                            int * n_past, int * st_pos_id) {
     const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
     common_sampler_accept(smpl, id, true);
+
+    const llama_model * model = llama_get_model(ctx_llama);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     static std::string ret;
-    if (
+    if (llama_vocab_is_eog(vocab, id)) {
         ret = "</s>";
     } else {
         ret = common_token_to_piece(ctx_llama, id);
@@ -310,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {
 
     llama_model_params model_params = common_model_params_to_llama(*params);
 
-    llama_model * model =
+    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
@@ -328,11 +332,10 @@ static struct llava_context * llava_init_context(common_params * params, llama_m
 
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
-
     llama_context_params ctx_params = common_context_params_to_llama(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
 
-    llama_context * ctx_llama =
+    llama_context * ctx_llama = llama_init_from_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
@@ -354,7 +357,7 @@ static void llava_free(struct llava_context * ctx_llava) {
     }
 
     llama_free(ctx_llava->ctx_llama);
-
+    llama_model_free(ctx_llava->model);
     llama_backend_free();
 }
 
@@ -481,7 +484,7 @@ static void debug_test_mrope_2d() {
 }
 
 static void debug_dump_img_embed(struct llava_context * ctx_llava) {
-    int n_embd =
+    int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama));
     int ne = n_embd * 4;
     float vals[56 * 56 * 3];
     // float embd[ne];
@@ -575,7 +578,7 @@ int main(int argc, char ** argv) {
         }
     }
 
-
+    llama_model_free(model);
 
     return 0;
 }
package/src/llama.cpp/examples/lookahead/lookahead.cpp

@@ -58,8 +58,10 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -147,7 +149,7 @@ int main(int argc, char ** argv) {
     }
 
     // here we keep adding new n-grams as we go
-    ngram_container ngrams_observed(
+    ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
     // debug
     struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
@@ -297,7 +299,7 @@ int main(int argc, char ** argv) {
         }
         fflush(stdout);
 
-        if (
+        if (llama_vocab_is_eog(vocab, id)) {
            has_eos = true;
        }
 
@@ -474,9 +476,6 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
package/src/llama.cpp/examples/lookup/lookup-create.cpp

@@ -1,14 +1,9 @@
 #include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
-#include "ggml.h"
 #include "llama.h"
 
-#include <cstdint>
-#include <fstream>
-#include <iostream>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 int main(int argc, char ** argv){
@@ -25,16 +20,16 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-
-
+    llama_model_ptr & model = llama_init.model;
+    llama_context_ptr & ctx = llama_init.context;
+
     GGML_ASSERT(model != nullptr);
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
     fprintf(stderr, "%s: tokenization done\n", __func__);
 
-
     common_ngram_cache ngram_cache;
     common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
package/src/llama.cpp/examples/lookup/lookup-stats.cpp

@@ -30,12 +30,11 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-
-    llama_context * ctx = llama_init.context;
+    llama_context_ptr & ctx = llama_init.context;
 
     // tokenize the prompt
     std::vector<llama_token> inp;
-    inp = common_tokenize(ctx, params.prompt, true, true);
+    inp = common_tokenize(ctx.get(), params.prompt, true, true);
 
     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;
@@ -66,7 +65,7 @@ int main(int argc, char ** argv){
     }
 
     const int n_input = inp.size();
-    const int n_ctx = llama_n_ctx(ctx);
+    const int n_ctx = llama_n_ctx(ctx.get());
 
     int n_drafted = 0;
     int n_accept = 0;
@@ -150,9 +149,6 @@ int main(int argc, char ** argv){
     LOG_INF("n_accept = %d\n", n_accept);
     LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
package/src/llama.cpp/examples/lookup/lookup.cpp

@@ -33,8 +33,10 @@ int main(int argc, char ** argv){
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // tokenize the prompt
     std::vector<llama_token> inp;
@@ -136,7 +138,7 @@ int main(int argc, char ** argv){
             LOG("%s", token_str.c_str());
         }
 
-        if (
+        if (llama_vocab_is_eog(vocab, id)) {
             has_eos = true;
         }
 
@@ -243,9 +245,6 @@ int main(int argc, char ** argv){
 
     llama_batch_free(batch_tgt);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
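The lookahead/lookup hunks also reflect that `common_init_result` now owns the model and context through smart pointers (`llama_model_ptr`, `llama_context_ptr`), which is why the manual `llama_free`/`llama_free_model` calls are removed. A sketch of the updated ownership pattern (the `run` wrapper is hypothetical and assumes `params` was populated by the example's usual argument parsing):

    #include "common.h"
    #include "llama.h"

    #include <vector>

    static int run(common_params & params) {
        common_init_result llama_init = common_init_from_params(params);

        // non-owning raw handles for APIs that still take raw pointers
        llama_model   * model = llama_init.model.get();
        llama_context * ctx   = llama_init.context.get();
        if (model == nullptr || ctx == nullptr) {
            return 1;
        }

        const llama_vocab * vocab = llama_model_get_vocab(model);

        std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);
        (void) vocab;
        (void) inp;

        // ... decode / sample ...

        // no llama_free(ctx) / llama_free_model(model) here: both are released
        // automatically when llama_init goes out of scope
        return 0;
    }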
package/src/llama.cpp/examples/main/main.cpp

@@ -5,7 +5,6 @@
 #include "sampling.h"
 #include "llama.h"
 
-#include <cassert>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
@@ -31,6 +30,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
+
 static llama_context ** g_ctx;
 static llama_model ** g_model;
 static common_sampler ** g_smpl;
@@ -145,24 +146,26 @@ int main(int argc, char ** argv) {
     llama_context * ctx = nullptr;
     common_sampler * smpl = nullptr;
 
-    std::vector<common_chat_msg> chat_msgs;
-
     g_model = &model;
     g_ctx = &ctx;
     g_smpl = &smpl;
 
+    std::vector<common_chat_msg> chat_msgs;
+
     // load the model and apply lora adapter, if any
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);
 
-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
@@ -196,15 +199,31 @@ int main(int argc, char ** argv) {
 
     llama_attach_threadpool(ctx, threadpool, threadpool_batch);
 
-    const int n_ctx_train =
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
     if (n_ctx > n_ctx_train) {
         LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
+    // auto enable conversation mode if chat template is available
+    const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
+    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
+        if (has_chat_template) {
+            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        } else {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        }
+    }
+
+    // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
+    if (params.conversation_mode && !has_chat_template) {
+        LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
+    }
+
     // print chat template example in conversation mode
-    if (params.
+    if (params.conversation_mode) {
         if (params.enable_chat_template) {
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
         } else {
@@ -241,9 +260,9 @@ int main(int argc, char ** argv) {
         }
     }
 
-    const bool add_bos =
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
     if (!llama_model_has_encoder(model)) {
-        GGML_ASSERT(!
+        GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
     }
 
     LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);
@@ -251,8 +270,10 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;
 
     {
-        auto prompt = (params.
-
+        auto prompt = (params.conversation_mode && params.enable_chat_template)
+            // format the system prompt in conversation mode (fallback to default if empty)
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            // otherwise use the prompt as is
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG_DBG("tokenize the prompt\n");
@@ -269,7 +290,7 @@ int main(int argc, char ** argv) {
     // Should not run without any tokens
     if (embd_inp.empty()) {
         if (add_bos) {
-            embd_inp.push_back(
+            embd_inp.push_back(llama_vocab_bos(vocab));
             LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
         } else {
             LOG_ERR("input is empty\n");
@@ -326,7 +347,7 @@ int main(int argc, char ** argv) {
         params.n_keep += add_bos; // always keep the BOS token
     }
 
-    if (params.
+    if (params.conversation_mode) {
         params.interactive_first = true;
     }
 
@@ -450,7 +471,11 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
         LOG_INF( " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_INF( "%s
+        LOG_INF( "%s", control_message);
+        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
+            LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+        }
+        LOG_INF("\n");
 
         is_interacting = params.interactive_first;
     }
@@ -494,8 +519,8 @@ int main(int argc, char ** argv) {
             }
 
             llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
-            if (decoder_start_token_id ==
-                decoder_start_token_id =
+            if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
+                decoder_start_token_id = llama_vocab_bos(vocab);
             }
 
             embd_inp.clear();
@@ -742,7 +767,7 @@ int main(int argc, char ** argv) {
         }
 
        // deal with end of generation tokens in interactive mode
-        if (
+        if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) {
            LOG_DBG("found an EOG token\n");
 
            if (params.interactive) {
@@ -762,7 +787,7 @@ int main(int argc, char ** argv) {
        }
 
        // if current token is not EOG, we add it to current assistant message
-        if (params.
+        if (params.conversation_mode) {
            const auto id = common_sampler_last(smpl);
            assistant_ss << common_token_to_piece(ctx, id, false);
        }
@@ -770,17 +795,17 @@ int main(int argc, char ** argv) {
        if (n_past > 0 && is_interacting) {
            LOG_DBG("waiting for user input\n");
 
-            if (params.
+            if (params.conversation_mode) {
                LOG("\n> ");
            }
 
            if (params.input_prefix_bos) {
                LOG_DBG("adding input prefix BOS token\n");
-                embd_inp.push_back(
+                embd_inp.push_back(llama_vocab_bos(vocab));
            }
 
            std::string buffer;
-            if (!params.input_prefix.empty() && !params.
+            if (!params.input_prefix.empty() && !params.conversation_mode) {
                LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                LOG("%s", params.input_prefix.c_str());
            }
@@ -804,7 +829,7 @@ int main(int argc, char ** argv) {
            // Entering a empty line lets the user pass control back
            if (buffer.length() > 1) {
                // append input suffix if any
-                if (!params.input_suffix.empty() && !params.
+                if (!params.input_suffix.empty() && !params.conversation_mode) {
                    LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                    LOG("%s", params.input_suffix.c_str());
                }
@@ -817,7 +842,7 @@ int main(int argc, char ** argv) {
                    string_process_escapes(buffer);
                }
 
-                bool format_chat = params.
+                bool format_chat = params.conversation_mode && params.enable_chat_template;
                std::string user_inp = format_chat
                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                    : std::move(buffer);
@@ -830,8 +855,8 @@ int main(int argc, char ** argv) {
 
                // if user stop generation mid-way, we must add EOT to finish model's last response
                if (need_insert_eot && format_chat) {
-                    llama_token eot =
-                    embd_inp.push_back(eot ==
+                    llama_token eot = llama_vocab_eot(vocab);
+                    embd_inp.push_back(eot == LLAMA_TOKEN_NULL ? llama_vocab_eos(vocab) : eot);
                    need_insert_eot = false;
                }
 
@@ -866,7 +891,7 @@ int main(int argc, char ** argv) {
        }
 
        // end of generation
-        if (!embd.empty() &&
+        if (!embd.empty() && llama_vocab_is_eog(vocab, embd.back()) && !(params.interactive)) {
            LOG(" [end of text]\n");
            break;
        }
@@ -889,9 +914,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     ggml_threadpool_free_fn(threadpool);