@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275

package/src/llama.cpp/examples/embedding/embedding.cpp

@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <ctime>
@@ -31,13 +33,24 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 }
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    const struct llama_model * model = llama_get_model(ctx);
+
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
     // run model
-    fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        fprintf(stderr, "%s : failed to decode\n", __func__);
+    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
+    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
+        // encoder-only model
+        if (llama_encode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to encode\n", __func__);
+        }
+    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        // decoder-only model
+        if (llama_decode(ctx, batch) < 0) {
+            LOG_ERR("%s : failed to decode\n", __func__);
+        }
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -45,11 +58,22 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
             continue;
         }
 
-        // try to get sequence embeddings - supported only when pooling_type is not NONE
-        const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        const float * embd = nullptr;
+        int embd_pos = 0;
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            // try to get token embeddings
+            embd = llama_get_embeddings_ith(ctx, i);
+            embd_pos = i;
+            GGML_ASSERT(embd != NULL && "failed to get token embeddings");
+        } else {
+            // try to get sequence embeddings - supported only when pooling_type is not NONE
+            embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+            embd_pos = batch.seq_id[i][0];
+            GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
+        }
 
-        float * out = output + batch.seq_id[i][0] * n_embd;
+        float * out = output + embd_pos * n_embd;
         llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
@@ -57,35 +81,26 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 
+    gpt_init();
+
     params.embedding = true;
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
     // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
@@ -93,20 +108,21 @@ int main(int argc, char ** argv) {
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+
+    if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
+        LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__);
         return 1;
     }
 
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
    }
 
     // split the prompt into lines
@@ -119,9 +135,9 @@ int main(int argc, char ** argv) {
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = ::llama_tokenize(ctx, prompt, true, false);
+        auto inp = ::llama_tokenize(ctx, prompt, true, true);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -132,20 +148,20 @@ int main(int argc, char ** argv) {
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
         if (inp.empty() || inp.back() != llama_token_sep(model)) {
-            fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__);
-            fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
+            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+            LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
        }
     }
 
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) inputs.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
             for (int j = 0; j < (int) inputs[i].size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+                LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
             }
-            fprintf(stderr, "\n\n");
+            LOG("\n\n");
         }
     }
 
@@ -153,13 +169,23 @@ int main(int argc, char ** argv) {
     const int n_prompts = prompts.size();
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
+    // count number of embeddings
+    int n_embd_count = 0;
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        for (int k = 0; k < n_prompts; k++) {
+            n_embd_count += inputs[k].size();
+        }
+    } else {
+        n_embd_count = n_prompts;
+    }
+
     // allocate output
     const int n_embd = llama_n_embd(model);
-    std::vector<float> embeddings(n_prompts * n_embd, 0);
+    std::vector<float> embeddings(n_embd_count * n_embd, 0);
     float * emb = embeddings.data();
 
     // break into batches
-    int p = 0; // number of prompts processed already
+    int e = 0; // number of embeddings already stored
     int s = 0; // number of prompts in current batch
     for (int k = 0; k < n_prompts; k++) {
         // clamp to n_batch tokens
@@ -169,11 +195,11 @@ int main(int argc, char ** argv) {
 
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
-            float * out = emb + p * n_embd;
+            float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
-            llama_batch_clear(batch);
-            p += s;
+            e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
             s = 0;
+            llama_batch_clear(batch);
        }
 
         // add to batch
@@ -182,39 +208,67 @@ int main(int argc, char ** argv) {
     }
 
     // final batch
-    float * out = emb + p * n_embd;
+    float * out = emb + e * n_embd;
     batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
     if (params.embd_out.empty()) {
-        // print the first part of the embeddings or for a single prompt, the full embedding
-        fprintf(stdout, "\n");
-        for (int j = 0; j < n_prompts; j++) {
-            fprintf(stdout, "embedding %d: ", j);
-            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-                if (params.embd_normalize == 0) {
-                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
-                } else {
-                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+        LOG("\n");
+
+        if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+            for (int j = 0; j < n_embd_count; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < std::min(3, n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
                 }
+                LOG(" ... ");
+                for (int i = n_embd - 3; i < n_embd; i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
            }
-            fprintf(stdout, "\n");
-        }
-
-        // print cosine similarity matrix
-        if (n_prompts > 1) {
-            fprintf(stdout, "\n");
-            printf("cosine similarity matrix:\n\n");
-            for (int i = 0; i < n_prompts; i++) {
-                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+        } else if (pooling_type == LLAMA_POOLING_TYPE_RANK) {
+            for (int j = 0; j < n_embd_count; j++) {
+                // NOTE: if you change this log - update the tests in ci/run.sh
+                LOG("rerank score %d: %8.3f\n", j, emb[j * n_embd]);
            }
-            fprintf(stdout, "\n");
-            for (int i = 0; i < n_prompts; i++) {
-                for (int j = 0; j < n_prompts; j++) {
-                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f ", sim);
+        } else {
+            // print the first part of the embeddings or for a single prompt, the full embedding
+            for (int j = 0; j < n_prompts; j++) {
+                LOG("embedding %d: ", j);
+                for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                    if (params.embd_normalize == 0) {
+                        LOG("%6.0f ", emb[j * n_embd + i]);
+                    } else {
+                        LOG("%9.6f ", emb[j * n_embd + i]);
+                    }
+                }
+                LOG("\n");
+            }
+
+            // print cosine similarity matrix
+            if (n_prompts > 1) {
+                LOG("\n");
+                LOG("cosine similarity matrix:\n\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    LOG("%6.6s ", prompts[i].c_str());
+                }
+                LOG("\n");
+                for (int i = 0; i < n_prompts; i++) {
+                    for (int j = 0; j < n_prompts; j++) {
+                        float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                        LOG("%6.2f ", sim);
+                    }
+                    LOG("%1.10s", prompts[i].c_str());
+                    LOG("\n");
                 }
-                fprintf(stdout, "%1.10s", prompts[i].c_str());
-                fprintf(stdout, "\n");
             }
         }
     }
@@ -222,43 +276,45 @@ int main(int argc, char ** argv) {
     if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
         const bool notArray = params.embd_out != "array";
 
-        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        LOG(notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
         for (int j = 0;;) { // at least one iteration (one prompt)
-            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
-            fprintf(stdout, "[");
+            if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            LOG("[");
             for (int i = 0;;) { // at least one iteration (n_embd > 0)
-                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
                 i++;
-                if (i < n_embd) fprintf(stdout, ","); else break;
+                if (i < n_embd) LOG(","); else break;
            }
-            fprintf(stdout, notArray ? "]\n }" : "]");
+            LOG(notArray ? "]\n }" : "]");
             j++;
-            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+            if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break;
        }
-        fprintf(stdout, notArray ? "\n ]" : "]\n");
+        LOG(notArray ? "\n ]" : "]\n");
 
         if (params.embd_out == "json+" && n_prompts > 1) {
-            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
-            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
-                fprintf(stdout, " [");
-                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+            LOG(",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
+                LOG(" [");
+                for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
                     float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
-                    fprintf(stdout, "%6.2f", sim);
+                    LOG("%6.2f", sim);
                     j++;
-                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                    if (j < n_embd_count) LOG(", "); else break;
                 }
-                fprintf(stdout, " ]");
+                LOG(" ]");
                 i++;
-                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+                if (i < n_embd_count) LOG(",\n"); else break;
            }
-            fprintf(stdout, "\n ]");
+            LOG("\n ]");
        }
 
-        if (notArray) fprintf(stdout, "\n}\n");
+        if (notArray) LOG("\n}\n");
     }
 
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
     // clean up
-    llama_print_timings(ctx);
     llama_batch_free(batch);
     llama_free(ctx);
     llama_free_model(model);

package/src/llama.cpp/examples/eval-callback/eval-callback.cpp

@@ -1,11 +1,11 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 #include "ggml.h"
 
 #include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
 
 /**
@@ -31,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     GGML_ASSERT(n > 0);
     float sum = 0;
     for (int64_t i3 = 0; i3 < ne[3]; i3++) {
-        printf(" [\n");
+        LOG(" [\n");
         for (int64_t i2 = 0; i2 < ne[2]; i2++) {
             if (i2 == n && ne[2] > 2*n) {
-                printf(" ..., \n");
+                LOG(" ..., \n");
                 i2 = ne[2] - n;
            }
-            printf(" [\n");
+            LOG(" [\n");
             for (int64_t i1 = 0; i1 < ne[1]; i1++) {
                 if (i1 == n && ne[1] > 2*n) {
-                    printf(" ..., \n");
+                    LOG(" ..., \n");
                     i1 = ne[1] - n;
                }
-                printf(" [");
+                LOG(" [");
                 for (int64_t i0 = 0; i0 < ne[0]; i0++) {
                     if (i0 == n && ne[0] > 2*n) {
-                        printf("..., ");
+                        LOG("..., ");
                         i0 = ne[0] - n;
                    }
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
@@ -64,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     } else {
                         GGML_ABORT("fatal error");
                     }
-                    printf("%12.4f", v);
+                    LOG("%12.4f", v);
                     sum += v;
-                    if (i0 < ne[0] - 1) printf(", ");
+                    if (i0 < ne[0] - 1) LOG(", ");
                 }
-                printf("],\n");
+                LOG("],\n");
             }
-            printf(" ],\n");
+            LOG(" ],\n");
         }
-        printf(" ]\n");
-        printf(" sum = %f\n", sum);
+        LOG(" ]\n");
+        LOG(" sum = %f\n", sum);
     }
 }
 
@@ -102,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
-    printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
-           t->name, ggml_type_name(t->type), ggml_op_desc(t),
-           src0->name, ggml_ne_string(src0).c_str(),
-           src1 ? src1_str : "",
-           ggml_ne_string(t).c_str());
+    LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
+        t->name, ggml_type_name(t->type), ggml_op_desc(t),
+        src0->name, ggml_ne_string(src0).c_str(),
+        src1 ? src1_str : "",
+        ggml_ne_string(t).c_str());
 
 
     // copy the data from the GPU memory if needed
@@ -127,12 +127,12 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 
 static bool run(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
 
     if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
-        fprintf(stderr, "%s : failed to eval\n", __func__);
+        LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }
 
@@ -144,14 +144,11 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
-    print_build_info();
-
-    std::mt19937 rng(params.seed);
+    gpt_init();
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -163,18 +160,20 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
     }
 
     bool OK = run(ctx, params);
@@ -182,7 +181,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
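
Taken together, the example diffs above all apply the same migration: gpt_params_parse() now takes a LLAMA_EXAMPLE_* id and handles usage printing itself, gpt_init() replaces print_build_info() and the manual seed/RNG setup, llama_init_from_gpt_params() returns a llama_init_result struct instead of a std::tie'd model/context pair, the common LOG()/LOG_INF()/LOG_WRN()/LOG_ERR() macros replace fprintf(stderr/stdout, ...), and llama_perf_context_print() replaces llama_print_timings(). A minimal sketch of the resulting example skeleton, assembled only from calls that appear in the hunks above (plus the examples' existing llama_backend_free() cleanup), might look like this:

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // the parser now takes an example id and prints usage itself on failure
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        gpt_init(); // replaces print_build_info() and manual RNG seeding

        llama_backend_init();
        llama_numa_init(params.numa);

        // model and context now come back in a single llama_init_result struct
        llama_init_result llama_init = llama_init_from_gpt_params(params);

        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;
        if (model == nullptr || ctx == nullptr) {
            LOG_ERR("%s: failed to init\n", __func__); // LOG_* macros replace fprintf(stderr, ...)
            return 1;
        }

        // ... example-specific work ...

        LOG("\n");
        llama_perf_context_print(ctx); // replaces llama_print_timings(ctx)

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();

        return 0;
    }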