@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275

package/src/llama.cpp/examples/embedding/embedding.cpp (+143 -87)

@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -31,13 +33,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 }
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
+
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to decode\n", __func__);
+        }
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -45,11 +58,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             continue;
         }
 
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        const float * embd = nullptr;
+        int embd_pos = 0;
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
 
-        float * out = output + batch.seq_id[i][0] * n_embd;
+        float * out = output + embd_pos * n_embd;
         llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -57,35 +81,26 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
+    gpt_init();
+
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
    params.n_ubatch = params.n_batch;
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -93,20 +108,21 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     // split the prompt into lines
@@ -119,9 +135,9 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -132,20 +148,20 @@ int main(int argc, char ** argv) {
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
 
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
@@ -153,13 +169,23 @@ int main(int argc, char ** argv) {
     const int n_prompts = prompts.size();
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
     // allocate output
     const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
     // break into batches
-    int p = 0; // number of prompts already processed
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
@@ -169,11 +195,11 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
+            float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
-            llama_batch_clear(batch);
-            p += s;
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            llama_batch_clear(batch);
         }
 
         // add to batch
@@ -182,39 +208,67 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
+    float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
     if (params.embd_out.empty()) {
-        // print the first part of the embeddings or for a single prompt, the full embedding
-        fprintf(stdout, "\n");
-        for (int j = 0; j < n_prompts; j++) {
-            fprintf(stdout, "embedding %d: ", j);
-            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-                if (params.embd_normalize == 0) {
-                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
-                } else {
-                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
                 }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
             }
-            fprintf(stdout, "\n");
-        }
-
-        // print cosine similarity matrix
-        if (n_prompts > 1) {
-            fprintf(stdout, "\n");
-            printf("cosine similarity matrix:\n\n");
-            for (int i = 0; i < n_prompts; i++) {
-                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
             }
-            fprintf(stdout, "\n");
-            for (int i = 0; i < n_prompts; i++) {
-                for (int j = 0; j < n_prompts; j++) {
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f ", sim);
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                 }
-                fprintf(stdout, "%1.10s", prompts[i].c_str());
-                fprintf(stdout, "\n");
             }
         }
     }
@@ -222,43 +276,45 @@ int main(int argc, char ** argv) {
     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
         const bool notArray = params.embd_out != "array";
 
-        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
         for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            LOG("[");
            for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                 i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
             }
-            fprintf(stdout, notArray ? "]\n }" : "]");
+            LOG(notArray ? "]\n }" : "]");
             j++;
-            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
        }
-        fprintf(stdout, notArray ? "\n ]" : "]\n");
+        LOG(notArray ? "\n ]" : "]\n");
 
         if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
-            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
-                fprintf(stdout, " [");
-                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+            LOG(",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
+                LOG(" [");
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+                    LOG("%6.2f", sim);
                     j++;
-                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                 }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                 i++;
-                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
             }
-            fprintf(stdout, "\n ]");
+            LOG("\n ]");
         }
 
-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
     }
 
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);
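
The embedding.cpp changes above track llama.cpp's move from the older common helpers (gpt_params_parse without an example id, tuple-style init, fprintf/printf logging, llama_print_timings) to the new arg.h/log.h interfaces. Below is a minimal sketch of the new initialization pattern, assembled from the added lines in the diff plus the standard llama_backend_free() teardown; it assumes a build against the common library bundled under package/src/llama.cpp, and the model path is whatever -m points at:

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;

    // the parser now takes an example id that selects which CLI options are exposed
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
        return 1;
    }

    gpt_init(); // initializes the common logger behind LOG/LOG_INF/LOG_WRN/LOG_ERR

    params.embedding = true;

    llama_backend_init();
    llama_numa_init(params.numa);

    // llama_init_from_gpt_params now returns a struct instead of a tuple
    llama_init_result llama_init = llama_init_from_gpt_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL || ctx == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());

    // ... tokenize, batch and decode as in the diff above ...

    LOG("\n");
    llama_perf_context_print(ctx); // replaces llama_print_timings

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
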
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp (+33 -33)

@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"
 
 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
 
 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf(" [\n");
+        LOG(" [\n");
         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
             if (i2 == n && ne[2] > 2*n) {
-                printf(" ..., \n");
+                LOG(" ..., \n");
                 i2 = ne[2] - n;
             }
-            printf(" [\n");
+            LOG(" [\n");
             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                 if (i1 == n && ne[1] > 2*n) {
-                    printf(" ..., \n");
+                    LOG(" ..., \n");
                     i1 = ne[1] - n;
                 }
-                printf(" [");
+                LOG(" [");
                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                     if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                         i0 = ne[0] - n;
                     }
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else {
                         GGML_ABORT("fatal error");
                     }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                     sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                 }
-                printf("],\n");
+                LOG("],\n");
             }
-            printf(" ],\n");
+            LOG(" ],\n");
         }
-        printf(" ]\n");
-        printf(" sum = %f\n", sum);
+        LOG(" ]\n");
+        LOG(" sum = %f\n", sum);
     }
 }
 
@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+        t->name, ggml_type_name(t->type), ggml_op_desc(t),
+        src0->name, ggml_ne_string(src0).c_str(),
+        src1 ? src1_str : "",
+        ggml_ne_string(t).c_str());
 
 
     // copy the data from the GPU memory if needed
@@ -127,12 +127,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+        LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
 
@@ -144,14 +144,11 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    print_build_info();
-
-    std::mt19937 rng(params.seed);
+    gpt_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -163,18 +160,20 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     bool OK = run(ctx, params);
@@ -182,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
    }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
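
The eval-callback diff keeps the same tensor-inspection flow and only swaps printf/fprintf for the LOG macros. For orientation, a hedged sketch of how such a per-tensor callback is typically hooked up; the cb_eval and cb_eval_user_data field names are not shown in this diff and are assumed from llama.cpp's common.h:

#include "common.h"
#include "ggml.h"
#include "log.h"

// the scheduler calls this twice per node: once with ask == true to decide whether
// the tensor should be observed, then with ask == false once the tensor data is ready
static bool my_debug_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return true; // observe every tensor
    }
    LOG("%s: %s [%lld, %lld, %lld, %lld]\n", __func__, t->name,
        (long long) t->ne[0], (long long) t->ne[1], (long long) t->ne[2], (long long) t->ne[3]);
    return true;
}

// after gpt_params_parse()/gpt_init(), before llama_init_from_gpt_params():
//     params.cb_eval           = my_debug_cb;   // assumed field name
//     params.cb_eval_user_data = nullptr;       // assumed field name
//     params.warmup            = false;         // as in the diff, so the callback sees the real run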
|