@fugood/llama.node 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -3
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +8 -1
- package/package.json +3 -3
- package/patches/llama.patch +12 -12
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/LlamaContext.cpp +33 -1
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/.github/workflows/bench.yml +310 -0
- package/src/llama.cpp/.github/workflows/build.yml +1315 -0
- package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
- package/src/llama.cpp/.github/workflows/docker.yml +116 -0
- package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
- package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
- package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
- package/src/llama.cpp/.github/workflows/server.yml +183 -0
- package/src/llama.cpp/CMakeLists.txt +91 -1245
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
- package/src/llama.cpp/cmake/build-info.cmake +58 -0
- package/src/llama.cpp/cmake/git-vars.cmake +22 -0
- package/src/llama.cpp/common/CMakeLists.txt +4 -3
- package/src/llama.cpp/common/build-info.cpp.in +4 -0
- package/src/llama.cpp/common/common.cpp +1116 -877
- package/src/llama.cpp/common/common.h +191 -77
- package/src/llama.cpp/common/grammar-parser.cpp +118 -31
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
- package/src/llama.cpp/common/log.h +1 -1
- package/src/llama.cpp/common/ngram-cache.h +10 -3
- package/src/llama.cpp/common/sampling.cpp +19 -10
- package/src/llama.cpp/docs/build.md +353 -0
- package/src/llama.cpp/examples/CMakeLists.txt +22 -22
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +52 -55
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/chat-13B.bat +57 -0
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
- package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
- package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +38 -153
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
- package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
- package/src/llama.cpp/examples/llava/clip.cpp +23 -14
- package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
- package/src/llama.cpp/examples/llava/requirements.txt +3 -2
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +98 -75
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
- package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
- package/src/llama.cpp/examples/server/server.cpp +274 -671
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +28 -29
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +21 -29
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +23 -0
- package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
- package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
- package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
- package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
- package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
- package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
- package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
- package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
- package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
- package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
- package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
- package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
- package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
- package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
- package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
- package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
- package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
- package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
- package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
- package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
- package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
- package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
- package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
- package/src/llama.cpp/models/.editorconfig +1 -0
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/requirements/requirements-all.txt +12 -0
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
- package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
- package/src/llama.cpp/requirements.txt +5 -4
- package/src/llama.cpp/scripts/build-info.sh +30 -0
- package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
- package/src/llama.cpp/src/CMakeLists.txt +33 -0
- package/src/llama.cpp/src/llama-grammar.cpp +539 -0
- package/src/llama.cpp/src/llama-grammar.h +39 -0
- package/src/llama.cpp/src/llama-impl.h +26 -0
- package/src/llama.cpp/src/llama-sampling.cpp +635 -0
- package/src/llama.cpp/src/llama-sampling.h +56 -0
- package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
- package/src/llama.cpp/src/llama-vocab.h +130 -0
- package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
- package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
- package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
- package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
- package/src/llama.cpp/tests/CMakeLists.txt +19 -20
- package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
- package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
- package/src/llama.cpp/tests/test-double-float.cpp +2 -2
- package/src/llama.cpp/tests/test-grad0.cpp +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
- package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
- package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
- package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
- package/src/llama.cpp/tests/test-rope.cpp +3 -4
- package/src/llama.cpp/tests/test-sampling.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
- package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
- package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
- package/src/llama.cpp/ggml-opencl.cpp +0 -2305
- package/src/llama.cpp/ggml-opencl.h +0 -36
- package/src/llama.cpp/ggml-sycl.cpp +0 -17340
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
- package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
- /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
- /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
- /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
- /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
- /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
package/src/llama.cpp/examples/cvector-generator/pca.hpp
@@ -0,0 +1,325 @@
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+
+#ifdef GGML_USE_CUDA
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cstdio>
+#include <ctime>
+#include <string>
+#include <tuple>
+#include <vector>
+#include <algorithm>
+#include <iostream>
+#include <fstream>
+
+#define DEBUG_POS 5
+
+static void print_debug_tensor(struct ggml_tensor * t, bool with_data = true) {
+    printf("%s: %s (%s): [%d, %d]\n", __func__, t->name, ggml_type_name(t->type), (int) t->ne[0], (int) t->ne[1]);
+    if (!with_data) return;
+    printf("%s: %s[0] = [", __func__, t->name);
+    for (size_t i = 0; i <= DEBUG_POS; i++) {
+        printf(" %f,", ggml_get_f32_nd(t, i, 0, 0, 0));
+    }
+    printf(" ... ]\n");
+}
+
+namespace PCA {
+
+// input params for PCA computations
+struct pca_params {
+    int n_threads = 1;
+    int n_batch = 20; // number of iterations do to in one batch. larger the batch, more memory is used
+    int n_iterations = 1000;
+    float tolerance = 1e-7;
+
+    // for debugging
+    int i_layer = 0;
+    int n_layers = 0;
+};
+
+// result from each iteration
+struct pca_result {
+    struct ggml_tensor * calculated_square = NULL;
+    std::vector<struct ggml_tensor *> eigenvectors;
+    std::vector<float> distances;
+};
+
+struct pca_model {
+    ggml_backend_t backend = NULL;
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx; // context to compute graph on target device
+    struct ggml_context * ctx_host; // host context to store results
+
+    // tensors on target device
+    struct ggml_tensor * dev_input;
+    struct ggml_tensor * dev_square;
+    struct ggml_tensor * dev_eigenvector;
+
+    pca_model(struct ggml_tensor * t_input) {
+#ifdef GGML_USE_CUDA
+        fprintf(stderr, "%s: using CUDA backend\n", __func__);
+        backend = ggml_backend_cuda_init(0); // init device 0
+        if (!backend) {
+            fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+        }
+#endif
+
+        // TODO: enable Metal support when support for GGML_OP_SQRT is added
+        // #ifdef GGML_USE_METAL
+        //     fprintf(stderr, "%s: using Metal backend\n", __func__);
+        //     backend = ggml_backend_metal_init();
+        //     if (!backend) {
+        //         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+        //     }
+        // #endif
+
+        // if there aren't GPU Backends fallback to CPU backend
+        if (!backend) {
+            backend = ggml_backend_cpu_init();
+        }
+
+        const int num_tensors = 4;
+        struct ggml_init_params params {
+            /*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc =*/ true,
+        };
+        ctx = ggml_init(params);
+
+        auto n_samples = t_input->ne[0];
+        auto n_embd = t_input->ne[1];
+
+        dev_input = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_samples, n_embd);
+        dev_square = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        dev_eigenvector = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        ggml_set_name(dev_input, "dev_input");
+        ggml_set_name(dev_square, "dev_square");
+        ggml_set_name(dev_eigenvector, "dev_eigenvector");
+        buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
+        ggml_backend_tensor_set(dev_input, t_input->data, 0, ggml_nbytes(t_input));
+
+        // initialize eigenvector to random normalized vector
+        {
+            std::vector<float> random_vec(ggml_nelements(dev_eigenvector), 0.0);
+            std::default_random_engine generator(static_cast<unsigned int>(std::time(0)));
+            std::uniform_real_distribution<float> distribution(0.0, 1.0);
+            float sum_sqr = 0.0; // for normalizing random_vec
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                float f = distribution(generator);
+                sum_sqr += f * f;
+                random_vec[i] = f;
+            }
+            // normalize it
+            float random_vec_norm = std::sqrt(sum_sqr);
+            for (size_t i = 0; i < random_vec.size(); ++i) {
+                random_vec[i] /= random_vec_norm;
+            }
+            ggml_backend_tensor_set(dev_eigenvector, random_vec.data(), 0, ggml_nbytes(dev_eigenvector));
+        }
+    }
+
+    ~pca_model() {
+        ggml_free(ctx);
+        ggml_backend_buffer_free(buffer);
+        ggml_backend_free(backend);
+    }
+};
+
+static struct ggml_cgraph * build_graph_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        bool calc_square = false) {
+    GGML_ASSERT(params.n_batch > 0);
+    // TODO: buf_size must be able to scale with params.n_batch
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+    };
+    // create a temporally context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // turn v_diff_original into square matrix if needed
+    struct ggml_tensor * tmp_square;
+    if (calc_square) {
+        tmp_square = ggml_mul_mat(ctx0, model.dev_input, model.dev_input);
+        ggml_set_name(tmp_square, "tmp_square");
+    }
+
+    struct ggml_tensor * b_tensor;
+    struct ggml_tensor * distance;
+    struct ggml_tensor * old_eigen = model.dev_eigenvector;
+    struct ggml_tensor * input_square = calc_square ? tmp_square : model.dev_square;
+
+    for (int i = 0; i < params.n_batch; ++i) {
+        // b_tensor = square * eigenvector^T
+        b_tensor = ggml_mul_mat(ctx0, input_square, old_eigen);
+        ggml_set_name(b_tensor, "b_tensor");
+
+        // normalize
+        b_tensor = ggml_div_inplace(ctx0,
+            b_tensor,
+            ggml_sqrt_inplace(ctx0, ggml_sum_rows(ctx0, ggml_sqr(ctx0, b_tensor)))
+        );
+        ggml_format_name(b_tensor, "b_tensor_norm_%d", i);
+
+        // calculate distance(new eigenvector - old eigenvector)
+        // we don't use ggml_sub because it may not be implemented on GPU backend
+        struct ggml_tensor * new_sub_old = ggml_add(ctx0, old_eigen, ggml_scale(ctx0, b_tensor, -1));
+        distance = ggml_sqrt_inplace(ctx0,
+            ggml_sum_rows(ctx0, ggml_sqr_inplace(ctx0, new_sub_old)));
+        ggml_format_name(distance, "distance_%d", i);
+
+        old_eigen = b_tensor;
+
+        // build operations nodes
+        ggml_build_forward_expand(gf, distance);
+    }
+
+    // delete the temporally context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+static ggml_status compute_piter(
+        const struct pca_params & params,
+        const pca_model & model,
+        struct ggml_cgraph * gf,
+        ggml_gallocr_t allocr,
+        struct pca_result & result) {
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, params.n_threads);
+    }
+
+    // TODO: enable GPU support when support for GGML_OP_SQRT is added
+    //#ifdef GGML_USE_METAL
+    //    if (ggml_backend_is_metal(model.backend)) {
+    //        ggml_backend_metal_set_n_cb(model.backend, params.n_threads);
+    //    }
+    //#endif
+
+    ggml_status res = ggml_backend_graph_compute(model.backend, gf);
+    if (res == GGML_STATUS_SUCCESS) {
+        auto extract_i = [](std::string prefix, std::string str) -> int {
+            int i = -1;
+            if (str.rfind(prefix, 0) == 0) {
+                sscanf(str.c_str(), (prefix + "%d").c_str(), &i);
+            }
+            return i;
+        };
+        result.calculated_square = NULL;
+        result.eigenvectors.clear();
+        result.distances.clear();
+        result.eigenvectors.resize(params.n_batch);
+        result.distances.resize(params.n_batch);
+        // get output nodes
+        for (int i = 0; i < gf->n_nodes; ++i) {
+            auto node = gf->nodes[i];
+            int iter = -1;
+            // find b_tensor (without copying data from device)
+            if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
+                result.eigenvectors[iter] = node;
+            }
+            // find distances, then copy data from device
+            if ((iter = extract_i("distance_", node->name)) > -1) {
+                float d;
+                ggml_backend_tensor_get(node, &d, 0, sizeof(float));
+                result.distances[iter] = d;
+                // std::cout << node->name << " = " << d << "\n";
+            }
+            // find tmp_square if it exists (without copying data from device)
+            if (std::string(node->name) == "tmp_square") {
+                result.calculated_square = node;
+            }
+        }
+    }
+    return res;
+}
+
+static void power_iteration(
+        const struct pca_params & params,
+        struct ggml_tensor * input, // shape of input: [n_samples, n_embd]
+        struct ggml_tensor * output) {
+    //printf("in power iteration\n");
+    struct pca_model model(input);
+
+    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+    struct pca_result result;
+    struct ggml_tensor * last_eigenvector = NULL;
+
+    int n_iters = params.n_iterations / params.n_batch; // more batch, fewer iterations
+    for (int iter = 0; iter < n_iters; ++iter) {
+        bool calc_square = (iter == 0); // only need to calculate square for first iteration
+        struct ggml_cgraph * gf = build_graph_piter(params, model, calc_square);
+        // ggml_graph_dump_dot(gf, nullptr, "/tmp/_cgraph.dot");
+        compute_piter(params, model, gf, allocr, result);
+
+        for (size_t k = 0; k < result.distances.size(); ++k) {
+            last_eigenvector = result.eigenvectors[k];
+            if (result.distances[k] < params.tolerance) {
+                break; // done
+            }
+        }
+
+        if (calc_square) {
+            // copy and store the square matrix if needed
+            GGML_ASSERT(result.calculated_square != NULL);
+            ggml_backend_tensor_copy(result.calculated_square, model.dev_square);
+        }
+
+        {
+            // copy last eigen vector and store as input for next iteration
+            GGML_ASSERT(last_eigenvector != NULL);
+            ggml_backend_tensor_copy(last_eigenvector, model.dev_eigenvector);
+        }
+
+        printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
+            __func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
+    }
+
+    // get output tensor
+    GGML_ASSERT(last_eigenvector);
+    ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
+    //print_debug_tensor(output);
+    ggml_gallocr_free(allocr);
+
+    // TODO @ngxson : The output vector is randomly inverted
+    // Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
+}
+
+static void run_pca(
+        struct pca_params & params,
+        const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_samples, n_embd]
+        const std::vector<struct ggml_tensor *> & v_output) {
+    printf("%s: Running PCA...\n", __func__);
+    for (size_t il = 0; il < v_input.size(); ++il) {
+
+        // prepare output vector
+        struct ggml_tensor * ctrl_out = v_output[il];
+        ggml_format_name(ctrl_out, "direction.%ld", il+1);
+
+        // run power_iteration
+        params.i_layer = il;
+        params.n_layers = v_input.size();
+        power_iteration(params, v_input[il], ctrl_out);
+        printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
+    }
+}
+
+}
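For orientation (illustrative, not part of the package diff): the new pca.hpp extracts one principal direction per layer by power iteration — repeatedly multiply the current eigenvector estimate by the "square" matrix built from the input, renormalize, and stop once the distance between successive estimates drops below `tolerance`. A minimal standalone sketch of the same loop in plain C++ (toy 2x2 matrix, no ggml):

```cpp
// Power iteration on a small dense matrix, mirroring the loop in pca.hpp:
// b = normalize(A * b), stop when ||b_new - b_old|| < tolerance.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<std::vector<double>> A = {{4.0, 2.0}, {2.0, 3.0}}; // symmetric toy matrix
    std::vector<double> b = {1.0, 0.0};   // initial guess (random in pca.hpp)
    const double tolerance = 1e-7;
    const int n_iterations = 1000;

    for (int iter = 0; iter < n_iterations; ++iter) {
        // b_new = A * b
        std::vector<double> b_new(b.size(), 0.0);
        for (size_t i = 0; i < A.size(); ++i) {
            for (size_t j = 0; j < b.size(); ++j) {
                b_new[i] += A[i][j] * b[j];
            }
        }
        // normalize
        double norm = 0.0;
        for (double v : b_new) norm += v * v;
        norm = std::sqrt(norm);
        for (double & v : b_new) v /= norm;
        // distance between successive estimates (the "distance_%d" nodes in pca.hpp)
        double dist = 0.0;
        for (size_t i = 0; i < b.size(); ++i) dist += (b_new[i] - b[i]) * (b_new[i] - b[i]);
        b = b_new;
        if (std::sqrt(dist) < tolerance) break;
    }
    printf("dominant eigenvector ~ [%f, %f]\n", b[0], b[1]); // ~[0.788, 0.615]
    return 0;
}
```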
package/src/llama.cpp/examples/cvector-generator/positive.txt
@@ -0,0 +1,4 @@
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
+<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
+<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
+<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp
@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -7,23 +7,30 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static std::vector<std::string> split_lines(const std::string & s) {
-    std::string line;
+static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
     std::vector<std::string> lines;
-
-
-
+    size_t start = 0;
+    size_t end = s.find(separator);
+
+    while (end != std::string::npos) {
+        lines.push_back(s.substr(start, end - start));
+        start = end + separator.length();
+        end = s.find(separator, start);
     }
+
+    lines.push_back(s.substr(start)); // Add the last part
+
     return lines;
 }
 
-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens,
-
-
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
 
@@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-
-        /*fprintf(stdout, "unnormalized_embedding:");
-        for (int hh = 0; hh < n_embd; hh++) {
-            fprintf(stdout, "%9.6f ", embd[hh]);
-        }
-        fprintf(stdout, "\n");*/
-        llama_embd_normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd, embd_norm);
     }
 }
 
@@ -63,6 +58,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
@@ -79,9 +75,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -99,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
             __func__, n_ctx_train, n_ctx);
@@ -111,7 +110,7 @@ int main(int argc, char ** argv) {
     }
 
     // split the prompt into lines
-    std::vector<std::string> prompts = split_lines(params.prompt);
+    std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
@@ -171,7 +170,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_decode(ctx, batch, out, s, n_embd);
+            batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             llama_batch_clear(batch);
             p += s;
             s = 0;
@@ -184,29 +183,78 @@ int main(int argc, char ** argv) {
 
     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
-
-    // print the first part of the embeddings or for a single prompt, the full embedding
-    fprintf(stdout, "\n");
-    for (int j = 0; j < n_prompts; j++) {
-        fprintf(stdout, "embedding %d: ", j);
-        for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
-            fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
-        }
-        fprintf(stdout, "\n");
-    }
+    batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
 
-
-
+    if (params.embd_out.empty()) {
+        // print the first part of the embeddings or for a single prompt, the full embedding
         fprintf(stdout, "\n");
-
-
-        for (int
-
-
+        for (int j = 0; j < n_prompts; j++) {
+            fprintf(stdout, "embedding %d: ", j);
+            for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
+                if (params.embd_normalize == 0) {
+                    fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
+                } else {
+                    fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
+                }
             }
             fprintf(stdout, "\n");
         }
+
+        // print cosine similarity matrix
+        if (n_prompts > 1) {
+            fprintf(stdout, "\n");
+            printf("cosine similarity matrix:\n\n");
+            for (int i = 0; i < n_prompts; i++) {
+                fprintf(stdout, "%6.6s ", prompts[i].c_str());
+            }
+            fprintf(stdout, "\n");
+            for (int i = 0; i < n_prompts; i++) {
+                for (int j = 0; j < n_prompts; j++) {
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f ", sim);
+                }
+                fprintf(stdout, "%1.10s", prompts[i].c_str());
+                fprintf(stdout, "\n");
+            }
+        }
+    }
+
+    if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
+        const bool notArray = params.embd_out != "array";
+
+        fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
+        for (int j = 0;;) { // at least one iteration (one prompt)
+            if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
+            fprintf(stdout, "[");
+            for (int i = 0;;) { // at least one iteration (n_embd > 0)
+                fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
+                i++;
+                if (i < n_embd) fprintf(stdout, ","); else break;
+            }
+            fprintf(stdout, notArray ? "]\n }" : "]");
+            j++;
+            if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
+        }
+        fprintf(stdout, notArray ? "\n ]" : "]\n");
+
+        if (params.embd_out == "json+" && n_prompts > 1) {
+            fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
+            for (int i = 0;;) { // at least two iteration (n_prompts > 1)
+                fprintf(stdout, " [");
+                for (int j = 0;;) { // at least two iteration (n_prompts > 1)
+                    float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+                    fprintf(stdout, "%6.2f", sim);
+                    j++;
+                    if (j < n_prompts) fprintf(stdout, ", "); else break;
+                }
+                fprintf(stdout, " ]");
+                i++;
+                if (i < n_prompts) fprintf(stdout, ",\n"); else break;
+            }
+            fprintf(stdout, "\n ]");
+        }
+
+        if (notArray) fprintf(stdout, "\n}\n");
     }
 
     // clean up
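For orientation (illustrative, not part of the package diff): the embedding example now splits the prompt on a configurable separator (`params.embd_sep`) instead of hard-coded newlines, passes an `embd_normalize` mode into `batch_decode`, and supports `embd_out` formats ("json", "json+", "array"). A quick standalone check of the new `split_lines` behavior — the function body is copied from the hunk above; the driver and the "<#sep#>" separator string are only an example:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// same logic as the split_lines added in the diff above
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
    std::vector<std::string> lines;
    size_t start = 0;
    size_t end = s.find(separator);

    while (end != std::string::npos) {
        lines.push_back(s.substr(start, end - start));
        start = end + separator.length();
        end = s.find(separator, start);
    }

    lines.push_back(s.substr(start)); // Add the last part

    return lines;
}

int main() {
    // a multi-character separator keeps prompts that themselves contain newlines intact
    for (const auto & prompt : split_lines("first prompt<#sep#>second prompt", "<#sep#>")) {
        printf("[%s]\n", prompt.c_str()); // prints [first prompt] then [second prompt]
    }
    return 0;
}
```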
package/src/llama.cpp/examples/eval-callback/CMakeLists.txt
@@ -1,9 +1,9 @@
-set(TARGET eval-callback)
+set(TARGET llama-eval-callback)
 add_executable(${TARGET} eval-callback.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 set(TEST_TARGET test-eval-callback)
-add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
+add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
 set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
package/src/llama.cpp/examples/eval-callback/eval-callback.cpp
@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
         } else if (type == GGML_TYPE_I8) {
             v = (float) *(int8_t *) &data[i];
         } else {
-
+            GGML_ABORT("fatal error");
         }
         printf("%12.4f", v);
         sum += v;
@@ -99,7 +99,7 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
 
     char src1_str[128] = {0};
     if (src1) {
-
+        snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, ggml_ne_string(src1).c_str());
     }
 
     printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__,
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
 }
 
 int main(int argc, char ** argv) {
-
     callback_data cb_data;
 
     gpt_params params;
+
     if (!gpt_params_parse(argc, argv, params)) {
+        gpt_params_print_usage(argc, argv, params);
         return 1;
     }
 
     print_build_info();
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);