@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/speculative/speculative.cpp (+129 -103):

@@ -1,11 +1,16 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
+#include "log.h"
 #include "llama.h"

-#include <
+#include <algorithm>
 #include <cstdio>
+#include <cstring>
+#include <random>
+#include <set>
 #include <string>
 #include <vector>
-#include <set>

 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -21,19 +26,23 @@ struct seq_draft {
 std::vector<llama_token> tokens;
 std::vector<std::vector<llama_token_data>> dists;

-struct
+struct gpt_sampler * smpl = nullptr;
 };

 int main(int argc, char ** argv) {
 gpt_params params;

-
-
+// needed to get candidate probs even for temp <= 0.0
+params.sparams.n_probs = 128;
+
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
 return 1;
 }

+gpt_init();
+
 if (params.model_draft.empty()) {
-
+LOG_ERR("%s: --model-draft is required\n", __func__);
 return 1;
 }

@@ -43,18 +52,9 @@ int main(int argc, char ** argv) {
 // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
 const float p_split = params.p_split;

-
-params.seed = time(NULL);
-}
-std::default_random_engine rng(params.seed);
+std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
 std::uniform_real_distribution<> u_dist;

-#ifndef LOG_DISABLE_LOGS
-log_set_target(log_filename_generator("speculative", "log"));
-LOG_TEE("Log start\n");
-log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
 // init llama.cpp
 llama_backend_init();
 llama_numa_init(params.numa);
@@ -66,26 +66,31 @@ int main(int argc, char ** argv) {
 llama_context * ctx_dft = NULL;

 // load the target model
-
+llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+model_tgt = llama_init_tgt.model;
+ctx_tgt = llama_init_tgt.context;

 // load the draft model
 params.model = params.model_draft;
 params.n_gpu_layers = params.n_gpu_layers_draft;
-if (params.
-params.n_threads = params.
+if (params.draft_cpuparams.n_threads > 0) {
+params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
 }
-
-
+
+params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
+llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+model_dft = llama_init_dft.model;
+ctx_dft = llama_init_dft.context;

 const bool vocab_type_tgt = llama_vocab_type(model_tgt);
-
+LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt);

 const bool vocab_type_dft = llama_vocab_type(model_dft);
-
+LOG_DBG("vocab_type dft: %d\n", vocab_type_dft);

 if (vocab_type_tgt != vocab_type_dft) {
-
-
+LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__);
+LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
 return 1;
 }

@@ -95,7 +100,7 @@ int main(int argc, char ** argv) {
 llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
 llama_token_eos(model_tgt) != llama_token_eos(model_dft)
 ) {
-
+LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
 return 1;
 }

@@ -107,8 +112,8 @@ int main(int argc, char ** argv) {
 : n_vocab_dft - n_vocab_tgt;

 if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-
-
+LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
 n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
 return 1;
 }
@@ -117,8 +122,8 @@ int main(int argc, char ** argv) {
 const char * token_text_tgt = llama_token_get_text(model_tgt, i);
 const char * token_text_dft = llama_token_get_text(model_dft, i);
 if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-
-
+LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
+LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
 llama_token_to_piece(ctx_tgt, i).c_str(),
 llama_token_to_piece(ctx_dft, i).c_str());
 return 1;
@@ -135,18 +140,16 @@ int main(int argc, char ** argv) {
 const int max_tokens_list_size = max_context_size - 4;

 if ((int) inp.size() > max_tokens_list_size) {
-
+LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
 return 1;
 }

-
+LOG("\n\n");

 for (auto id : inp) {
-
+LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
 }

-fflush(stderr);
-
 const int n_input = inp.size();

 const auto t_enc_start = ggml_time_us();
@@ -174,19 +177,17 @@ int main(int argc, char ** argv) {
 // used to determine end of generation
 bool has_eos = false;

-// target model sampling context
-struct
+// target model sampling context (reuse the llama_context's sampling instance)
+struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
+
+struct llama_sampler * softmax = llama_sampler_init_softmax();

 // draft sequence data
 std::vector<seq_draft> drafts(n_seq_dft);

-params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
-if (params.sparams.temp == 0) {
-params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
-}
-
 for (int s = 0; s < n_seq_dft; ++s) {
-
+// allocate gpt_sampler for each draft sequence
+drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
 }

 llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@@ -210,7 +211,7 @@ int main(int argc, char ** argv) {
 active_seqs.insert(s);
 const auto & tokens = drafts[s].tokens;

-
+LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str());
 }

 int i_dft = 0;
@@ -228,12 +229,12 @@ int main(int argc, char ** argv) {
 bool accept = false;
 if (params.sparams.temp > 0) {
 // stochastic verification
+gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

-
-llama_sample_softmax(ctx_tgt, &dist_tgt);
-float p_tgt = 0, p_dft = 0;
+auto & dist_tgt = *gpt_sampler_get_candidates(smpl);

-
+float p_tgt = 0.0f;
+float p_dft = 0.0f;

 while (active_seqs.size() > 0) {
 // randomly select a sequence to verify from active sequences
@@ -252,9 +253,13 @@ int main(int argc, char ** argv) {
 }
 continue;
 }
-
+
+LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size());
 float r = u_dist(rng);
-llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), true };
+llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true };
+
+//GGML_ASSERT(dist_tgt.size <= dist_dft.size);
+
 // acquire the token probabilities assigned by the draft and target models
 for (size_t i = 0; i < dist_tgt.size; i++) {
 if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
@@ -267,24 +272,23 @@ int main(int argc, char ** argv) {
 break;
 }
 }
-
+LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt);
 if (r <= p_tgt / p_dft) {
 s_keep = s;
 accept = true;
 token_id = drafts[s].tokens[i_dft];
 token_str = llama_token_to_piece(ctx_tgt, token_id);
-
+gpt_sampler_accept(smpl, token_id, true);

-
+LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
 break;
 } else {
-
+LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
 drafts[s].active = false;

 // calculate residual probability
 GGML_ASSERT(dist_tgt.sorted);
 GGML_ASSERT(dist_dft.sorted);
-float sum_probs = 0.0f;

 // sort dist by id
 std::sort(dist_tgt.data, dist_tgt.data + dist_tgt.size, [](const llama_token_data &a, const llama_token_data &b) {
@@ -294,10 +298,18 @@ int main(int argc, char ** argv) {
 return a.id < b.id;
 });

+float sum_probs = 0.0f;
+
 for (size_t i = 0; i < dist_tgt.size; i++) {
-
+if (i < dist_dft.size) {
+dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p - dist_dft.data[i].p);
+} else {
+dist_tgt.data[i].p = std::max(0.0f, dist_tgt.data[i].p);
+}
+
 sum_probs += dist_tgt.data[i].p;
 }
+
 for (size_t i = 0; i < dist_tgt.size; i++) {
 dist_tgt.data[i].p /= sum_probs;
 }
@@ -326,22 +338,28 @@ int main(int argc, char ** argv) {
 if (!accept) {
 // all drafted tokens were rejected
 // sample from the target model
-
-
-
+LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n");
+std::vector<float> probs(dist_tgt.size);
+for (size_t i = 0; i < dist_tgt.size; ++i) {
+probs[i] = dist_tgt.data[i].p;
+}
+
+std::discrete_distribution<> dist(probs.begin(), probs.end());
+
+const int idx = dist(rng);
+
+token_id = dist_tgt.data[idx].id;
+gpt_sampler_accept(smpl, token_id, true);
 token_str = llama_token_to_piece(ctx_tgt, token_id);
 }
-
 } else {
 // greedy verification

 // sample from the target model
-
-token_id =
+LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
+token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

-
-
-//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
+gpt_sampler_accept(smpl, token_id, true);

 token_str = llama_token_to_piece(ctx_tgt, token_id);

@@ -351,7 +369,7 @@ int main(int argc, char ** argv) {
 }

 if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) {
-
+LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str());

 s_keep = s;
 accept = true;
@@ -373,26 +391,24 @@ int main(int argc, char ** argv) {
 ++i_dft;
 if (params.use_color) {
 // Color token according to its origin sequence
-
+LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
 } else {
-
+LOG("%s", token_str.c_str());
 }
-fflush(stdout);
 continue;
 } else {
-
-fflush(stdout);
+LOG("%s", token_str.c_str());
 break;
 }
 }
 }

 {
-
+LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str());

 // TODO: simplify
 {
-
+LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);

 llama_kv_cache_seq_keep(ctx_dft, s_keep);
 llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1);
@@ -419,7 +435,7 @@ int main(int argc, char ** argv) {
 llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

 llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-//
+// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
 llama_decode(ctx_dft, batch_dft);

 ++n_past_dft;
@@ -429,7 +445,10 @@ int main(int argc, char ** argv) {
 break;
 }

-
+if (drafts[0].smpl) {
+gpt_sampler_free(drafts[0].smpl);
+}
+drafts[0].smpl = gpt_sampler_clone(smpl);

 int n_seq_cur = 1;
 int n_past_cur = n_past_dft;
@@ -458,21 +477,21 @@ int main(int argc, char ** argv) {
 continue;
 }

-
+gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);

-const auto
+const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);

-for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p
-
-k, s, i, cur_p[k].id, cur_p[k].p, llama_token_to_piece(ctx_dft, cur_p[k].id).c_str());
+for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
+LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
+k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
 }

 std::vector<int> sa(1, s);

 // attempt to split the branch if the probability is high enough
 for (int f = 1; f < 8; ++f) {
-if (n_seq_cur < n_seq_dft && cur_p[f].p > p_split) {
-
+if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

 llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
 llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
@@ -498,7 +517,10 @@ int main(int argc, char ** argv) {
 drafts[n_seq_cur].i_batch_dft = drafts[s].i_batch_dft;
 drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

-
+if (drafts[n_seq_cur].smpl) {
+gpt_sampler_free(drafts[n_seq_cur].smpl);
+}
+drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);

 sa.push_back(n_seq_cur);

@@ -510,15 +532,15 @@ int main(int argc, char ** argv) {

 // add drafted token for each sequence
 for (int is = 0; is < (int) sa.size(); ++is) {
-const llama_token id = cur_p[is].id;
+const llama_token id = cur_p->data[is].id;

 const int s = sa[is];

-
+gpt_sampler_accept(drafts[s].smpl, id, true);

 drafts[s].tokens.push_back(id);
 // save cur_p.data into drafts[s].dists
-drafts[s].dists.push_back(cur_p);
+drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});

 // add unique drafted tokens to the target batch
 drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
@@ -558,7 +580,7 @@ int main(int argc, char ** argv) {
 llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
 }

-//
+// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
 llama_decode(ctx_tgt, batch_tgt);
 ++n_past_tgt;
 }
@@ -576,29 +598,33 @@ int main(int argc, char ** argv) {

 auto t_dec_end = ggml_time_us();

-
+LOG("\n\n");

-
-
+LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

-
-
-
-
-
-
+LOG_INF("\n");
+LOG_INF("n_draft = %d\n", n_draft);
+LOG_INF("n_predict = %d\n", n_predict);
+LOG_INF("n_drafted = %d\n", n_drafted);
+LOG_INF("n_accept = %d\n", n_accept);
+LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);

-
-
+LOG_INF("\n");
+LOG_INF("draft:\n\n");
+// TODO: print sampling/grammar timings for all drafts
+llama_perf_context_print(ctx_dft);

-
-
+LOG_INF("\n");
+LOG_INF("target:\n\n");
+gpt_perf_print(ctx_tgt, smpl);

-
+gpt_sampler_free(smpl);
 for (int s = 0; s < n_seq_dft; ++s) {
-
+gpt_sampler_free(drafts[s].smpl);
 }

+llama_sampler_free(softmax);
 llama_batch_free(batch_dft);

 llama_free(ctx_tgt);
@@ -609,7 +635,7 @@ int main(int argc, char ** argv) {

 llama_backend_free();

-
+LOG("\n\n");

 return 0;
 }
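
The speculative.cpp diff above is dominated by one change: the example now drives sampling through the gpt_sampler API from the vendored common library (see common/sampling.cpp and the new common/arg.cpp in the file list) instead of the old per-example sampling context and grammar-parser plumbing. As a rough orientation, the sketch below strings together only the calls visible in the hunks above; the helper draft_one_token and its surrounding setup are illustrative assumptions, not code shipped in this package.

// Minimal sketch of the gpt_sampler lifecycle used by the updated example,
// assuming the vendored llama.cpp "common" headers are on the include path.
// draft_one_token() is a hypothetical helper for illustration only.
#include "common.h"
#include "sampling.h"
#include "llama.h"

static llama_token draft_one_token(llama_model * model, llama_context * ctx,
                                   const gpt_params & params, int i_batch) {
    // one sampler per sequence; it owns the candidate list and grammar state
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

    // sample from the logits at index i_batch of the last decoded batch
    const llama_token id = gpt_sampler_sample(smpl, ctx, i_batch);

    // the candidate distribution is kept inside the sampler
    const auto * cur_p = gpt_sampler_get_candidates(smpl);
    (void) cur_p; // cur_p->size, cur_p->data[k].id, cur_p->data[k].p

    // record the accepted token (same boolean flag the example passes)
    gpt_sampler_accept(smpl, id, true);

    // a split draft branch duplicates the sampler state instead of rebuilding it
    gpt_sampler * smpl_branch = gpt_sampler_clone(smpl);

    gpt_sampler_free(smpl_branch);
    gpt_sampler_free(smpl);

    return id;
}

In the example itself, one such sampler is created for the target model and one per draft sequence, and gpt_sampler_clone replaces the old manual copying of sampling state when a draft branch is split.
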
package/src/llama.cpp/examples/sycl/run-llama2.sh (+10 -19):

@@ -4,33 +4,24 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: MIT

-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

-if [ $# -gt 0 ]; then
-GGML_SYCL_DEVICE=$1
-GGML_SYCL_SINGLE_GPU=1
-else
-GGML_SYCL_DEVICE=0
-GGML_SYCL_SINGLE_GPU=0
-fi
-
 #export GGML_SYCL_DEBUG=1

-
 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.

-
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/llama-2-7b.Q4_0.gguf
+NGL=33
+CONEXT=8192
+
+if [ $# -gt 0 ]; then
+GGML_SYCL_DEVICE=$1
 echo "use $GGML_SYCL_DEVICE as main GPU"
 #use signle GPU only
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+
 else
 #use multiple GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m
+ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
 fi
-
-#use main GPU only
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
-
-#use multiple GPUs with same max compute units
-#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/sycl/win-run-llama2.bat (+1 -1):

@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force


-.\build\bin\
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
package/src/llama.cpp/examples/tokenize/tokenize.cpp (+25 -27):

@@ -1,11 +1,13 @@
 #include "common.h"
+//#include "log.h" // TODO: start using log.h
 #include "llama.h"

-#include <cmath>
 #include <cstdio>
+#include <cstring>
 #include <fstream>
 #include <string>
 #include <vector>
+#include <iostream> // TODO: remove me

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -13,25 +15,25 @@
 #include <shellapi.h> // For CommandLineToArgvW
 #endif

-static void print_usage_information(const char * argv0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+static void print_usage_information(const char * argv0) {
+printf("usage: %s [options]\n\n", argv0);
+printf("The tokenize program tokenizes a prompt using a given model,\n");
+printf("and prints the resulting tokens to standard output.\n\n");
+printf("It needs a model file, a prompt, and optionally other flags\n");
+printf("to control the behavior of the tokenizer.\n\n");
+printf(" The possible options are:\n");
+printf("\n");
+printf(" -h, --help print this help and exit\n");
+printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n");
+printf(" --ids if given, only print numerical token IDs, and not token strings.\n");
+printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
+printf(" --stdin read prompt from standard input.\n");
+printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+printf(" --no-parse-special do not parse control tokens.\n");
+printf(" --log-disable disable logs. Makes stderr quiet when loading the model.\n");
+printf(" --show-count print the total number of tokens.\n");
 }

 static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
@@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) {
 const int argc = argv.size();

 if (argc <= 1) {
-print_usage_information(argv[0].c_str()
+print_usage_information(argv[0].c_str());
 return 1;
 }

@@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) {
 for (; iarg < argc; ++iarg) {
 std::string arg{argv[iarg]};
 if (arg == "-h" || arg == "--help") {
-print_usage_information(argv[0].c_str()
+print_usage_information(argv[0].c_str());
 return 0;
 }
 else if (arg == "--ids") {
@@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) {
 // Start actually doing the tokenizing stuff.
 //////

-#ifdef LOG_DISABLE_LOGS
-disable_logging = true;
-#endif
-
 if (disable_logging) {
 llama_log_set(llama_log_callback_null, NULL);
 }
@@ -362,7 +360,7 @@ int main(int raw_argc, char ** raw_argv) {
 prompt = stdin_buffer.str();
 }

-const bool model_wants_add_bos =
+const bool model_wants_add_bos = llama_add_bos_token(model);
 const bool add_bos = model_wants_add_bos && !no_bos;
 const bool parse_special = !no_parse_special;

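
Stepping back to the speculative.cpp hunks above, the densest part of that change is the stochastic verification path: when a drafted token is rejected, the target distribution is reduced by the draft distribution entry by entry (clamped at zero) and renormalized, and if no drafted token survives, the next token is drawn from that residual with std::discrete_distribution. The standalone snippet below reproduces just that arithmetic with made-up probabilities, so it can be compiled and run without llama.cpp.

// Standalone illustration of the residual-distribution step from the
// speculative example above: residual[i] = max(0, p_tgt[i] - p_dft[i]),
// renormalized, then sampled. The inputs here are made-up numbers.
#include <algorithm>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    // target and draft probabilities for the same (sorted-by-id) token set
    std::vector<float> p_tgt = {0.50f, 0.30f, 0.15f, 0.05f};
    std::vector<float> p_dft = {0.70f, 0.10f, 0.15f, 0.05f};

    // subtract the draft mass and clamp at zero
    std::vector<float> residual(p_tgt.size());
    float sum = 0.0f;
    for (size_t i = 0; i < p_tgt.size(); ++i) {
        const float p = i < p_dft.size() ? p_dft[i] : 0.0f;
        residual[i] = std::max(0.0f, p_tgt[i] - p);
        sum += residual[i];
    }

    // renormalize so the residual is a proper distribution
    for (float & r : residual) {
        r /= sum;
    }

    // sample an index from the residual distribution
    std::default_random_engine rng(std::random_device{}());
    std::discrete_distribution<int> dist(residual.begin(), residual.end());
    const int idx = dist(rng);

    printf("sampled index %d (residual p = %.3f)\n", idx, residual[idx]);
    return 0;
}

Any C++11 compiler will do, for example: g++ -std=c++11 residual.cpp (the file name is only an example).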
|