@fugood/llama.node 0.0.1-alpha.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +85 -0
- package/README.md +56 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +13 -0
- package/lib/binding.ts +57 -0
- package/lib/index.js +24 -0
- package/lib/index.ts +13 -0
- package/package.json +65 -0
- package/src/addons.cpp +506 -0
- package/src/llama.cpp/CMakeLists.txt +1320 -0
- package/src/llama.cpp/build.zig +172 -0
- package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
- package/src/llama.cpp/common/CMakeLists.txt +87 -0
- package/src/llama.cpp/common/base64.hpp +392 -0
- package/src/llama.cpp/common/common.cpp +2949 -0
- package/src/llama.cpp/common/common.h +324 -0
- package/src/llama.cpp/common/console.cpp +501 -0
- package/src/llama.cpp/common/console.h +19 -0
- package/src/llama.cpp/common/grammar-parser.cpp +440 -0
- package/src/llama.cpp/common/grammar-parser.h +29 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/json.hpp +24766 -0
- package/src/llama.cpp/common/log.h +724 -0
- package/src/llama.cpp/common/ngram-cache.cpp +282 -0
- package/src/llama.cpp/common/ngram-cache.h +94 -0
- package/src/llama.cpp/common/sampling.cpp +353 -0
- package/src/llama.cpp/common/sampling.h +147 -0
- package/src/llama.cpp/common/stb_image.h +8396 -0
- package/src/llama.cpp/common/train.cpp +1513 -0
- package/src/llama.cpp/common/train.h +233 -0
- package/src/llama.cpp/examples/CMakeLists.txt +52 -0
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
- package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/batched/batched.cpp +262 -0
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
- package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
- package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/infill/infill.cpp +767 -0
- package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
- package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
- package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
- package/src/llama.cpp/examples/llava/clip.h +85 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
- package/src/llama.cpp/examples/llava/llava.cpp +426 -0
- package/src/llama.cpp/examples/llava/llava.h +50 -0
- package/src/llama.cpp/examples/llava/requirements.txt +3 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
- package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
- package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/main/main.cpp +957 -0
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
- package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
- package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
- package/src/llama.cpp/examples/server/httplib.h +9465 -0
- package/src/llama.cpp/examples/server/server.cpp +3826 -0
- package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
- package/src/llama.cpp/examples/server/utils.hpp +653 -0
- package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple/simple.cpp +183 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
- package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
- package/src/llama.cpp/ggml-alloc.c +985 -0
- package/src/llama.cpp/ggml-alloc.h +76 -0
- package/src/llama.cpp/ggml-backend-impl.h +141 -0
- package/src/llama.cpp/ggml-backend.c +2099 -0
- package/src/llama.cpp/ggml-backend.h +233 -0
- package/src/llama.cpp/ggml-common.h +1853 -0
- package/src/llama.cpp/ggml-cuda.h +43 -0
- package/src/llama.cpp/ggml-impl.h +265 -0
- package/src/llama.cpp/ggml-kompute.cpp +2006 -0
- package/src/llama.cpp/ggml-kompute.h +46 -0
- package/src/llama.cpp/ggml-metal.h +66 -0
- package/src/llama.cpp/ggml-mpi.c +216 -0
- package/src/llama.cpp/ggml-mpi.h +39 -0
- package/src/llama.cpp/ggml-opencl.cpp +2301 -0
- package/src/llama.cpp/ggml-opencl.h +36 -0
- package/src/llama.cpp/ggml-quants.c +12678 -0
- package/src/llama.cpp/ggml-quants.h +133 -0
- package/src/llama.cpp/ggml-sycl.cpp +17882 -0
- package/src/llama.cpp/ggml-sycl.h +49 -0
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
- package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
- package/src/llama.cpp/ggml-vulkan.h +29 -0
- package/src/llama.cpp/ggml.c +21819 -0
- package/src/llama.cpp/ggml.h +2403 -0
- package/src/llama.cpp/llama.cpp +17468 -0
- package/src/llama.cpp/llama.h +1117 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
- package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
- package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
- package/src/llama.cpp/prompts/alpaca.txt +1 -0
- package/src/llama.cpp/prompts/assistant.txt +31 -0
- package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
- package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
- package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
- package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
- package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
- package/src/llama.cpp/prompts/chat.txt +28 -0
- package/src/llama.cpp/prompts/dan-modified.txt +1 -0
- package/src/llama.cpp/prompts/dan.txt +1 -0
- package/src/llama.cpp/prompts/mnemonics.txt +93 -0
- package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
- package/src/llama.cpp/prompts/reason-act.txt +18 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
- package/src/llama.cpp/requirements.txt +12 -0
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
- package/src/llama.cpp/scripts/xxd.cmake +16 -0
- package/src/llama.cpp/sgemm.cpp +999 -0
- package/src/llama.cpp/sgemm.h +12 -0
- package/src/llama.cpp/tests/CMakeLists.txt +78 -0
- package/src/llama.cpp/tests/get-model.cpp +21 -0
- package/src/llama.cpp/tests/get-model.h +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
- package/src/llama.cpp/tests/test-c.c +7 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
- package/src/llama.cpp/tests/test-double-float.cpp +57 -0
- package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
- package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
- package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
- package/src/llama.cpp/tests/test-opt.cpp +181 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
- package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
- package/src/llama.cpp/tests/test-rope.cpp +221 -0
- package/src/llama.cpp/tests/test-sampling.cpp +301 -0
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
- package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
- package/src/llama.cpp/unicode-data.cpp +1651 -0
- package/src/llama.cpp/unicode-data.h +16 -0
- package/src/llama.cpp/unicode.cpp +277 -0
- package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -0,0 +1,211 @@
#include "common.h"
#include "llama.h"

#include <ctime>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static std::vector<std::string> split_lines(const std::string & s) {
    std::string line;
    std::vector<std::string> lines;
    std::stringstream ss(s);
    while (std::getline(ss, line)) {
        lines.push_back(line);
    }
    return lines;
}

static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
    for (size_t i = 0; i < tokens.size(); i++) {
        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
    }
}

static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
    llama_kv_cache_clear(ctx);

    // run model
    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
    if (llama_decode(ctx, batch) < 0) {
        fprintf(stderr, "%s : failed to decode\n", __func__);
    }

    for (int i = 0; i < batch.n_tokens; i++) {
        if (!batch.logits[i]) {
            continue;
        }

        // try to get sequence embeddings - supported only when pooling_type is not NONE
        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
        if (embd == NULL) {
            embd = llama_get_embeddings_ith(ctx, i);
            if (embd == NULL) {
                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
                continue;
            }
        }

        float * out = output + batch.seq_id[i][0] * n_embd;
        llama_embd_normalize(embd, out, n_embd);
    }
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    params.embedding = true;
    // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;

    print_build_info();

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
        params.prompt = gpt_random_prompt(rng);
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);

    if (n_ctx > n_ctx_train) {
        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    // split the prompt into lines
    std::vector<std::string> prompts = split_lines(params.prompt);

    // max batch size
    const uint64_t n_batch = params.n_batch;
    GGML_ASSERT(params.n_batch >= params.n_ctx);

    // tokenize the prompts and trim
    std::vector<std::vector<int32_t>> inputs;
    for (const auto & prompt : prompts) {
        auto inp = ::llama_tokenize(ctx, prompt, true, false);
        if (inp.size() > n_batch) {
            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                    __func__, (long long int) inp.size(), (long long int) n_batch);
            return 1;
        }
        inputs.push_back(inp);
    }

    // add SEP if not present
    for (auto & inp : inputs) {
        if (inp.empty() || inp.back() != llama_token_sep(model)) {
            inp.push_back(llama_token_sep(model));
        }
    }

    // tokenization stats
    if (params.verbose_prompt) {
        for (int i = 0; i < (int) inputs.size(); i++) {
            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
            for (int j = 0; j < (int) inputs[i].size(); j++) {
                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
            }
            fprintf(stderr, "\n\n");
        }
    }

    // initialize batch
    const int n_prompts = prompts.size();
    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);

    // allocate output
    const int n_embd = llama_n_embd(model);
    std::vector<float> embeddings(n_prompts * n_embd, 0);
    float * emb = embeddings.data();

    // break into batches
    int p = 0; // number of prompts processed already
    int s = 0; // number of prompts in current batch
    for (int k = 0; k < n_prompts; k++) {
        // clamp to n_batch tokens
        auto & inp = inputs[k];

        const uint64_t n_toks = inp.size();

        // encode if at capacity
        if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
            batch_decode(ctx, batch, out, s, n_embd);
            llama_batch_clear(batch);
            p += s;
            s = 0;
        }

        // add to batch
        batch_add_seq(batch, inp, s);
        s += 1;
    }

    // final batch
    float * out = emb + p * n_embd;
    batch_decode(ctx, batch, out, s, n_embd);

    // print the first part of the embeddings or for a single prompt, the full embedding
    fprintf(stdout, "\n");
    for (int j = 0; j < n_prompts; j++) {
        fprintf(stdout, "embedding %d: ", j);
        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
        }
        fprintf(stdout, "\n");
    }

    // print cosine similarity matrix
    if (n_prompts > 1) {
        fprintf(stdout, "\n");
        printf("cosine similarity matrix:\n\n");
        for (int i = 0; i < n_prompts; i++) {
            for (int j = 0; j < n_prompts; j++) {
                float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
                fprintf(stdout, "%6.2f ", sim);
            }
            fprintf(stdout, "\n");
        }
    }

    // clean up
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
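Note on the two common.h helpers used above: llama_embd_normalize L2-normalizes each pooled sequence embedding, and llama_embd_similarity_cos produces the values printed in the cosine similarity matrix. The following is only a rough standalone sketch of the underlying math, not the package's or llama.cpp's implementation; the helper names in the diff are the real ones, everything below is illustrative.

#include <cmath>
#include <cstdio>
#include <vector>

// L2-normalize src into dst (conceptually what the normalization helper does).
static void normalize_l2(const float * src, float * dst, int n) {
    double norm = 0.0;
    for (int i = 0; i < n; i++) norm += (double) src[i] * src[i];
    norm = std::sqrt(norm);
    for (int i = 0; i < n; i++) dst[i] = norm > 0.0 ? (float) (src[i] / norm) : 0.0f;
}

// Cosine similarity of two embeddings of length n.
static float cosine_sim(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    return (na > 0.0 && nb > 0.0) ? (float) (dot / (std::sqrt(na) * std::sqrt(nb))) : 0.0f;
}

int main() {
    std::vector<float> a = {1.0f, 2.0f, 3.0f};
    std::vector<float> b = {2.0f, 4.0f, 6.0f};
    std::vector<float> an(3), bn(3);
    normalize_l2(a.data(), an.data(), 3);
    normalize_l2(b.data(), bn.data(), 3);
    // parallel vectors -> prints 1.00
    printf("cosine similarity = %.2f\n", cosine_sim(an.data(), bn.data(), 3));
    return 0;
}

Because batch_decode stores already-normalized embeddings, the cosine similarity of two rows reduces to a plain dot product.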
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -0,0 +1,9 @@
set(TARGET eval-callback)
add_executable(${TARGET} eval-callback.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -0,0 +1,195 @@
#include "common.h"
#include "llama.h"
#include "ggml.h"

#include <cstdio>
#include <random>
#include <string>
#include <tuple>
#include <vector>

/**
 * This the arbitrary data which will be passed to each callback.
 * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor.
 */
struct callback_data {
    std::vector<uint8_t> data;
};

static std::string ggml_ne_string(const ggml_tensor * t) {
    std::string str;
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        str += std::to_string(t->ne[i]);
        if (i + 1 < GGML_MAX_DIMS) {
            str += ", ";
        }
    }
    return str;
}

static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
    GGML_ASSERT(n > 0);
    float sum = 0;
    for (int64_t i3 = 0; i3 < ne[3]; i3++) {
        printf(" [\n");
        for (int64_t i2 = 0; i2 < ne[2]; i2++) {
            if (i2 == n && ne[2] > 2*n) {
                printf(" ..., \n");
                i2 = ne[2] - n;
            }
            printf(" [\n");
            for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                if (i1 == n && ne[1] > 2*n) {
                    printf(" ..., \n");
                    i1 = ne[1] - n;
                }
                printf(" [");
                for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                    if (i0 == n && ne[0] > 2*n) {
                        printf("..., ");
                        i0 = ne[0] - n;
                    }
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
                    } else if (type == GGML_TYPE_F32) {
                        v = *(float *) data + i;
                    } else if (type == GGML_TYPE_I32) {
                        v = (float) *(int32_t *) data + i;
                    } else if (type == GGML_TYPE_I16) {
                        v = (float) *(int16_t *) data + i;
                    } else if (type == GGML_TYPE_I8) {
                        v = (float) *(int8_t *) data + i;
                    } else {
                        GGML_ASSERT(false);
                    }
                    printf("%12.4f", v);
                    sum += v;
                    if (i0 < ne[0] - 1) printf(", ");
                }
                printf("],\n");
            }
            printf(" ],\n");
        }
        printf(" ]\n");
        printf(" sum = %f\n", sum);
    }
}

/**
 * GGML operations callback during the graph execution.
 *
 * @param t current tensor
 * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor
 *            if we return true, a follow-up call will be made with ask=false in which we can do the actual collection.
 *            see ggml_backend_sched_eval_callback
 * @param user_data user data to pass at each call back
 * @return true to receive data or continue the graph, false otherwise
 */
static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb_data = (callback_data *) user_data;

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];

    if (ask) {
        return true; // Always retrieve data
    }

    char src1_str[128] = {0};
    if (src1) {
        sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
    }

    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
           t->name, ggml_type_name(t->type), ggml_op_desc(t),
           src0->name, ggml_ne_string(src0).c_str(),
           src1 ? src1_str : "",
           ggml_ne_string(t).c_str());

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(t->buffer);

    if (!is_host) {
        auto n_bytes = ggml_nbytes(t);
        cb_data->data.resize(n_bytes);
        ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
    }

    if (!ggml_is_quantized(t->type)) {
        uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
        ggml_print_tensor(data, t->type, t->ne, t->nb, 3);
    }

    return true;
}

static bool run(llama_context * ctx, const gpt_params & params) {
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
    }

    return true;
}

int main(int argc, char ** argv) {

    callback_data cb_data;

    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    print_build_info();

    std::mt19937 rng(params.seed);
    if (params.random_prompt) {
        params.prompt = gpt_random_prompt(rng);
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ggml_debug;
    params.cb_eval_user_data = &cb_data;
    params.warmup = false;

    // init
    llama_model * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "%s : failed to init\n", __func__);
        return 1;
    }

    // print system information
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

    bool OK = run(ctx, params);
    if (!OK) {
        return 1;
    }

    llama_print_timings(ctx);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    return 0;
}
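The ggml_debug callback above follows the two-phase contract described in its doc comment: the scheduler first calls it with ask=true to check whether the tensor's data is wanted, then calls it again with ask=false once the data is available. The callback_data comment also anticipates adding an operation or tensor-name filter later. Below is a minimal sketch of how such a filter could look, reusing the same callback signature; the struct and function names here are hypothetical illustrations and are not part of this package.

#include "ggml.h"

#include <cstring>
#include <string>
#include <vector>

// Hypothetical extension of callback_data with a substring filter (illustration only).
struct filtered_callback_data {
    std::vector<uint8_t> data;
    std::string          name_filter; // e.g. "attn" to inspect attention tensors only
};

// Same two-phase contract as ggml_debug: when ask == true, report whether we want
// this tensor's data; when ask == false, the computed data is available for collection.
static bool ggml_debug_filtered(struct ggml_tensor * t, bool ask, void * user_data) {
    auto * cb = (filtered_callback_data *) user_data;

    const bool match = cb->name_filter.empty() ||
                       std::strstr(t->name, cb->name_filter.c_str()) != nullptr;

    if (ask) {
        return match; // only request data for tensors whose name matches the filter
    }

    // ask == false: reuse the same copy-from-backend and print logic as ggml_debug here.
    return true; // returning true lets the graph computation continue
}

Wiring would mirror the example above: assign the function to params.cb_eval and pass a pointer to the filter struct through params.cb_eval_user_data.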