@fugood/llama.node 0.3.0 → 0.3.2
This diff shows the published contents of the two package versions as they appear in their public registries and is provided for informational purposes only.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/llava/minicpmv-cli.cpp
@@ -0,0 +1,323 @@
+#include "arg.h"
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+#include "ggml.h"
+
+#include <algorithm>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <iostream> // TODO: remove me
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG("\nexample usage:\n\n%s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llama_model * llava_init(gpt_params * params) {
+    llama_backend_init();
+    llama_numa_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n" , __func__);
+        return NULL;
+    }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    if (params->n_ctx < 2048) {
+        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
+        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
+        ctx_params.n_ctx = 2048;
+    } else {
+        ctx_params.n_ctx = params->n_ctx;
+    }
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+static struct clip_ctx * clip_init_context(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string str2 = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+}
+
+static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
+    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
+    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
+
+    auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    slice_embed->embed = image_embed;
+    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
+    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
+    llava_image_embed_free(slice_embed);
+}
+
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+    std::string system_prompt;
+    int idx = 0;
+    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (has_minicpmv_projector == 2) {
+        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+    }
+    else if (has_minicpmv_projector == 3) {
+        system_prompt = "<|im_start|>user\n";
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
+    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    if (num_image_embeds > 1) {
+        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
+        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
+        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
+            for (size_t j = 0; j < num_image_embeds_col; ++j) {
+                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
+                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
+                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+                if (j == num_image_embeds_col - 1) {
+                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
+                }
+            }
+        }
+        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+    }
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
+}
+
+static const char * sample(struct gpt_sampler * smpl,
+                           struct llama_context * ctx_llama,
+                           int * n_past) {
+    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
+    gpt_sampler_accept(smpl, id, true);
+    static std::string ret;
+    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    if (!embeds) {
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
+        return NULL;
+    }
+
+    // process the prompt
+    if (params->prompt.empty() && params->interactive == false) {
+        LOG_ERR("prompt should be given or interactive mode should be on");
+        return NULL;
+    }
+
+    auto * model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+    const int64_t t_llava_init_start_us = ggml_time_us();
+    auto * ctx_llava = llava_init_context(params, model);
+    ctx_llava->ctx_clip = ctx_clip;
+    const int64_t t_llava_init_end_us = ggml_time_us();
+    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+
+    const int64_t t_process_image_start_us = ggml_time_us();
+    process_image(ctx_llava, embeds, params, n_past);
+    const int64_t t_process_image_end_us = ggml_time_us();
+    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+
+    llava_image_embed_free(embeds);
+    return ctx_llava;
+}
+
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+    std::string user_prompt = prompt;
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (!is_first) {
+        if (has_minicpmv_projector == 2) {
+            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
+        }
+        else if (has_minicpmv_projector == 3) {
+            user_prompt = "<|im_start|>user\n" + prompt;
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    if (has_minicpmv_projector == 2) {
+        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+    }
+    else if (has_minicpmv_projector == 3) {
+        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+    }
+
+    // generate the response
+
+    LOG_INF("\n");
+
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    return smpl;
+}
+
+static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+
+    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
+    return tmp;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+        return 1;
+    }
+
+    gpt_init();
+
+    if (params.mmproj.empty() || (params.image.empty())) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    for (auto & image : params.image) {
+        int n_past = 0;
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
+
+        if (!params.prompt.empty()) {
+            LOG("<user>%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
+            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+            std::string response;
+            bool have_tmp = false;
+            for (int i = 0; i < max_tgt_len; i++) {
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                response += tmp;
+                if (strcmp(tmp, "</s>") == 0){
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
+                }
+                if (strstr(tmp, "###")) break; // Yi-VL behavior
+                have_tmp = true;
+                printf("%s", tmp);
+                if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+                fflush(stdout);
+            }
+            gpt_sampler_free(smpl);
+        }else {
+            while (true) {
+                LOG("<user>");
+                std::string prompt;
+                std::getline(std::cin, prompt);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                std::string response;
+                for (int i = 0; i < max_tgt_len; i++) {
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
+                    response += tmp;
+                    if (strcmp(tmp, "</s>") == 0) break;
+                    if (strstr(tmp, "###")) break; // Yi-VL behavior
+                    printf("%s", tmp);// mistral llava-1.6
+                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                    fflush(stdout);
+                }
+                gpt_sampler_free(smpl);
+            }
+        }
+        printf("\n");
+        llama_perf_context_print(ctx_llava->ctx_llama);
+
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+
+    return 0;
+}
package/src/llama.cpp/examples/lookahead/lookahead.cpp
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
@@ -37,32 +39,27 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
+    gpt_init();
+
     const int W = 15; // lookahead window
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
     // load the target model
-
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
     // Tokenize the prompt
     std::vector<llama_token> inp;
@@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-
+    LOG("\n\n");
 
     for (auto id : inp) {
-
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -118,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct
+    struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
@@ -159,14 +156,14 @@ int main(int argc, char ** argv) {
 
     // sample first token
     {
-        id =
+        id = gpt_sampler_sample(smpl, ctx, 0);
 
-
+        gpt_sampler_accept(smpl, id, true);
 
         {
            const std::string token_str = llama_token_to_piece(ctx, id);
 
-
+            LOG("%s", token_str.c_str());
            fflush(stdout);
        }
    }
@@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
        }
 
        if (llama_decode(ctx, batch) != 0) {
-
+            LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
            return 1;
        }
 
@@ -284,19 +281,19 @@ int main(int argc, char ** argv) {
            }
 
            // sample the next token
-            id =
+            id = gpt_sampler_sample(smpl, ctx, i_batch);
 
-
+            gpt_sampler_accept(smpl, id, true);
 
            // print
            {
                const std::string token_str = llama_token_to_piece(ctx, id);
 
                if (v == 0) {
-
+                    LOG("%s", token_str.c_str());
                } else {
                    // print light cyan
-
+                    LOG("\033[0;96m%s\033[0m", token_str.c_str());
                }
                fflush(stdout);
 
@@ -330,21 +327,21 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-
+                    LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                }
 
                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-
+                    LOG(" - ngram %2d: ", i);
 
                    const int idx = id*(N - 1)*G + i*(N - 1);
 
                    for (int j = 0; j < N - 1; j++) {
                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
 
-
+                        LOG("%s", token_str.c_str());
                    }
 
-
+                    LOG("\n");
                }
            }
 
@@ -361,7 +358,7 @@ int main(int argc, char ** argv) {
            if (v == 0) {
                // sample from the last level
                for (int i = 0; i < W; i++) {
-                    tokens_j[N - 2][i] =
+                    tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                }
            } else {
                for (int i = 0; i < W; i++) {
@@ -455,23 +452,25 @@ int main(int argc, char ** argv) {
 
    auto t_dec_end = ggml_time_us();
 
-
+    LOG("\n\n");
+
+    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
 
-
-
+    LOG_INF("\n");
+    LOG_INF("W = %2d\n", W);
+    LOG_INF("N = %2d\n", N);
+    LOG_INF("G = %2d\n", G);
+    LOG_INF("\n");
+    LOG_INF("n_predict = %d\n", n_predict);
+    LOG_INF("n_accept = %d\n", n_accept);
 
-
-
-    LOG_TEE("N = %2d\n", N);
-    LOG_TEE("G = %2d\n", G);
-    LOG_TEE("\n");
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_accept = %d\n", n_accept);
+    LOG_INF("\n");
+    gpt_perf_print(ctx, smpl);
 
-
+    gpt_sampler_free(smpl);
 
    llama_kv_cache_view_free(&kvc_view);
-    llama_sampling_free(ctx_sampling);
 
    llama_batch_free(batch);
 
@@ -480,7 +479,7 @@ int main(int argc, char ** argv) {
 
    llama_backend_free();
 
-
+    LOG("\n\n");
 
    return 0;
 }
package/src/llama.cpp/examples/lookup/lookup-create.cpp
@@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <fstream>
@@ -13,8 +14,7 @@
 int main(int argc, char ** argv){
    gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }
 
@@ -22,11 +22,11 @@ int main(int argc, char ** argv){
    llama_backend_init();
    llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the model
-
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    GGML_ASSERT(model != nullptr);
 
    // tokenize the prompt
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
    llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
 }
package/src/llama.cpp/examples/lookup/lookup-stats.cpp
@@ -1,36 +1,37 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
 
-#include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <cinttypes>
 #include <fstream>
 #include <string>
 #include <vector>
-#include <unordered_map>
 
 int main(int argc, char ** argv){
    gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }
 
+    gpt_init();
+
    const int n_draft = params.n_draft;
 
    // init llama.cpp
    llama_backend_init();
    llama_numa_init(params.numa);
 
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
    // load the model
-
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
 
    // tokenize the prompt
    std::vector<llama_token> inp;
@@ -49,7 +50,7 @@ int main(int argc, char ** argv){
        try {
            ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
        } catch (std::ifstream::failure const &) {
-
+            LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
            exit(1);
        }
    }
@@ -128,7 +129,7 @@ int main(int argc, char ** argv){
            const int64_t eta_min = eta_ms / (60*1000);
            const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
 
-
+            LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s);
        }
 
        // After each chunk, update the dynamic ngram cache with the context ngram cache:
@@ -136,24 +137,24 @@ int main(int argc, char ** argv){
        ngram_cache_context.clear();
    }
 
-
+    LOG("\n");
 
-
-
-
-
-
-
+    LOG_INF("\n");
+    LOG_INF("n_draft = %d\n", n_draft);
+    LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx);
+    LOG_INF("n_drafted = %d\n", n_drafted);
+    LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+    LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
            t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
-
-
+    LOG_INF("n_accept = %d\n", n_accept);
+    LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
    llama_free(ctx);
    llama_free_model(model);
 
    llama_backend_free();
 
-
+    LOG("\n\n");
 
    return 0;
 }