@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0

package/src/llama.cpp/examples/batched/batched.cpp:

@@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     // number of parallel batches
     int n_parallel = params.n_parallel;
@@ -39,7 +39,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -51,13 +51,13 @@ int main(int argc, char ** argv) {
     // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(model, params.prompt, true);
+    tokens_list = common_tokenize(model, params.prompt, true);
 
     const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_predict, n_parallel);
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
     LOG("\n");
 
     for (auto id : tokens_list) {
-        LOG("%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", common_token_to_piece(ctx, id).c_str());
     }
 
     // create a llama_batch
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
 
     // evaluate the initial prompt
     for (size_t i = 0; i < tokens_list.size(); ++i) {
-        llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+        common_batch_add(batch, tokens_list[i], i, seq_ids, false);
     }
     GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
 
@@ -123,8 +123,8 @@ int main(int argc, char ** argv) {
             decoder_start_token_id = llama_token_bos(model);
         }
 
-        llama_batch_clear(batch);
-        llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+        common_batch_clear(batch);
+        common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
     }
 
     // llama_decode will output logits only for the last token of the prompt
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {
 
     while (n_cur <= n_predict) {
         // prepare the next batch
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         // sample the next token for each parallel sequence / stream
        for (int32_t i = 0; i < n_parallel; ++i) {
@@ -185,15 +185,15 @@ int main(int argc, char ** argv) {
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+                LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
             }
 
-            streams[i] += llama_token_to_piece(ctx, new_token_id);
+            streams[i] += common_token_to_piece(ctx, new_token_id);
 
             i_batch[i] = batch.n_tokens;
 
             // push this new token for next evaluation
-            llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+            common_batch_add(batch, new_token_id, n_cur, { i }, true);
 
             n_decode += 1;
         }

package/src/llama.cpp/examples/batched-bench/batched-bench.cpp:

@@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     int is_pp_shared = params.is_pp_shared;
 
@@ -36,7 +36,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_params_from_gpt_params(params);
+    llama_model_params model_params = common_model_params_to_llama(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+    llama_context_params ctx_params = common_context_params_to_llama(params);
 
     // ensure enough sequences are available
     ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
-            0, 0, 0, // unused
         };
 
         const int ret = llama_decode(ctx, batch_view);
@@ -92,7 +91,7 @@ int main(int argc, char ** argv) {
     // warm up
     {
         for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
+            common_batch_add(batch, 0, i, { 0 }, false);
         }
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -122,11 +121,11 @@ int main(int argc, char ** argv) {
            continue;
        }
 
-       llama_batch_clear(batch);
+       common_batch_clear(batch);
 
        for (int i = 0; i < pp; ++i) {
            for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
-               llama_batch_add(batch, 0, i, { j }, false);
+               common_batch_add(batch, 0, i, { j }, false);
            }
        }
        batch.logits[batch.n_tokens - 1] = true;
@@ -151,10 +150,10 @@ int main(int argc, char ** argv) {
        const auto t_tg_start = ggml_time_us();
 
        for (int i = 0; i < tg; ++i) {
-           llama_batch_clear(batch);
+           common_batch_clear(batch);
 
           for (int j = 0; j < pl; ++j) {
-               llama_batch_add(batch, 0, pp + i, { j }, true);
+               common_batch_add(batch, 0, pp + i, { j }, true);
           }
 
           if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
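
The dropped `0, 0, 0, // unused` initializer in the batched-bench hunk above reflects that this llama.cpp sync removes the `all_pos_0`/`all_pos_1`/`all_seq_id` fields from `llama_batch` (see the `include/llama.h` entry in the file list, +67 -33). Below is a minimal sketch of the updated batch-view construction; the helper name is hypothetical and only `llama.h` from this release is assumed:

```cpp
#include "llama.h"

// Hypothetical helper mirroring the hunk above: builds a view over a
// sub-range of an existing batch. There are no trailing
// all_pos_0/all_pos_1/all_seq_id initializers because those fields no
// longer exist on llama_batch in this revision.
static llama_batch make_batch_view(const llama_batch & batch, int32_t i, int32_t n_tokens) {
    llama_batch batch_view = {
        n_tokens,
        batch.token    + i,
        nullptr,            // embd: unused when token ids are provided
        batch.pos      + i,
        batch.n_seq_id + i,
        batch.seq_id   + i,
        batch.logits   + i,
    };
    return batch_view;
}
```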

package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp:

@@ -31,7 +31,7 @@ template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     std::string ret;
     for (; begin != end; ++begin) {
-        ret += llama_token_to_piece(ctx, *begin);
+        ret += common_token_to_piece(ctx, *begin);
     }
 
     return ret;
@@ -272,8 +272,8 @@ struct tokenized_prompt {
 
     tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
         const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-        tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
-        tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+        tokens_pos = common_tokenize(ctx, pos, add_bos, true);
+        tokens_neg = common_tokenize(ctx, neg, add_bos, true);
         max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
         padding_seq(ctx, tokens_pos, max_seq_len);
         padding_seq(ctx, tokens_neg, max_seq_len);
@@ -281,7 +281,7 @@ struct tokenized_prompt {
 
     void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
         // TODO: customize padding token
-        std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+        std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
         llama_token pad_tok = pad_tokens.back();
         while (tokens.size() < len) {
             tokens.push_back(pad_tok);
@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }
@@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
  * Load prompt files and completion file.
  * Then format each pair of prompt + completion to make an entry.
  */
-static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+static int prepare_entries(common_params & params, train_context & ctx_train) {
     // load prompts
     std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
     std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
 
@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model to get hparams
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;

package/src/llama.cpp/examples/embedding/embedding.cpp:

@@ -28,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
 static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
     size_t n_tokens = tokens.size();
     for (size_t i = 0; i < n_tokens; i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+        common_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
@@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + embd_pos * n_embd;
-        llama_embd_normalize(embd, out, n_embd, embd_norm);
+        common_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);
 
     // load the model
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, true);
+        auto inp = common_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
         LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
         LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
         for (int j = 0; j < (int) inputs[i].size(); j++) {
-            LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+            LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
         }
         LOG("\n\n");
     }
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
-            llama_batch_clear(batch);
+            common_batch_clear(batch);
         }
 
         // add to batch
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
         LOG("\n");
         for (int i = 0; i < n_prompts; i++) {
             for (int j = 0; j < n_prompts; j++) {
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                 LOG("%6.2f ", sim);
             }
             LOG("%1.10s", prompts[i].c_str());
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
         for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
             LOG(" [");
             for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                 LOG("%6.2f", sim);
                 j++;
                 if (j < n_embd_count) LOG(", "); else break;

package/src/llama.cpp/examples/eval-callback/eval-callback.cpp:

@@ -126,12 +126,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
     return true;
 }
 
-static bool run(llama_context * ctx, const gpt_params & params) {
+static bool run(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
 
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
@@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 int main(int argc, char ** argv) {
     callback_data cb_data;
 
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_init_result llama_init = llama_init_from_gpt_params(params);
+    common_init_result llama_init = common_init_from_params(params);
 
     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
         LOG_INF("\n");
     }
 

package/src/llama.cpp/examples/export-lora/export-lora.cpp:

@@ -128,7 +128,7 @@ struct lora_merge_ctx {
 
     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<llama_lora_adapter_info> & lora_files,
+            std::vector<common_lora_adapter_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
                 // optionally dequantize it
                 printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
                 auto nels = ggml_nelements(inp_base);
-                auto qtype = ggml_internal_get_type_traits(base->type);
+                const auto * qtype = ggml_get_type_traits(base->type);
                 std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-                qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+                qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
                 ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
             } else {
                 ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
@@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 

package/src/llama.cpp/examples/gen-docs/gen-docs.cpp:

@@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) {
     file << "| -------- | ----------- |\n";
 }
 
-static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
     file << "| `";
     // args
     for (const auto & arg : opt.args) {
@@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
     file << "` | " << md_help << " |\n";
 }
 
-static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
     write_table_header(file);
     for (const auto & opt : opts) {
         write_table_entry(file, *opt);
@@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
 static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
-    gpt_params params;
-    auto ctx_arg = gpt_params_parser_init(params, ex);
+    common_params params;
+    auto ctx_arg = common_params_parser_init(params, ex);
 
-    std::vector<llama_arg *> common_options;
-    std::vector<llama_arg *> sparam_options;
-    std::vector<llama_arg *> specific_options;
+    std::vector<common_arg *> common_options;
+    std::vector<common_arg *> sparam_options;
+    std::vector<common_arg *> specific_options;
     for (auto & opt : ctx_arg.options) {
         // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
         if (opt.is_sparam) {

package/src/llama.cpp/examples/gritlm/gritlm.cpp:

@@ -15,11 +15,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
     llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
     for (uint64_t i = 0; i < sentences.size(); i++) {
-        llama_batch_clear(batch);
+        common_batch_clear(batch);
 
         const std::string input_string = instruction + sentences[i];
 
-        std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
+        std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);
 
         const int32_t n_toks = inputs.size();
 
@@ -28,7 +28,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         // inputs.push_back(llama_token_eos(model));
 
         // we want to ignore instruction tokens for mean pooling
-        const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
+        const int32_t n_inst = common_tokenize(model, instruction, true, false).size();
 
 #ifdef GRIT_DEBUG
         // debug tokens - should be matching as referenced in the GritLM sample
@@ -40,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
 
         // add input to batch (this increments n_tokens)
         for (int32_t j = 0; j < n_toks; j++) {
-            llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+            common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
         }
 
         // clear previous kv_cache values (irrelevant for embeddings)
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
         }
 
         std::vector<float> emb_norm(emb_unorm.size());
-        llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+        common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
         result.push_back(emb_norm);
 
 #ifdef GRIT_DEBUG
@@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
 
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
 
-    std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
+    std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
     int32_t i_current_token = 0;
 
     while (true) {
-        llama_batch_clear(bat);
+        common_batch_clear(bat);
         {
             const int32_t n_inputs = inputs.size();
 
             for (int32_t i = 0; i < n_inputs; i++) {
-                llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+                common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
             }
         }
         inputs.clear();
@@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
             break;
         }
 
-        std::string piece = llama_token_to_piece(ctx, token);
+        std::string piece = common_token_to_piece(ctx, token);
         if (stream) {
             std::printf("%s", piece.c_str());
             std::fflush(stdout);
@@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) {
 }
 
 int main(int argc, char * argv[]) {
-    gpt_params params;
+    common_params params;
 
-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    gpt_init();
+    common_init();
 
-    llama_model_params mparams = llama_model_params_from_gpt_params(params);
-    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+    llama_model_params mparams = common_model_params_to_llama(params);
+    llama_context_params cparams = common_context_params_to_llama(params);
 
     llama_backend_init();
 
@@ -199,10 +199,10 @@ int main(int argc, char * argv[]) {
 
     const int n_embd = llama_n_embd(model);
 
-    const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
-    const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
-    const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
-    const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+    const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+    const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
 
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
     std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);