@fugood/llama.node 0.2.0 → 0.2.2

This diff compares the content of publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
Files changed (94)
  1. package/CMakeLists.txt +9 -0
  2. package/README.md +1 -1
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +1 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/LlamaContext.cpp +2 -2
  23. package/src/TokenizeWorker.cpp +1 -1
  24. package/src/llama.cpp/CMakeLists.txt +82 -54
  25. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
  26. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
  27. package/src/llama.cpp/common/common.cpp +748 -754
  28. package/src/llama.cpp/common/common.h +49 -41
  29. package/src/llama.cpp/common/grammar-parser.cpp +10 -1
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
  31. package/src/llama.cpp/common/log.h +5 -5
  32. package/src/llama.cpp/common/sampling.cpp +92 -10
  33. package/src/llama.cpp/common/sampling.h +6 -1
  34. package/src/llama.cpp/common/train.cpp +2 -2
  35. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  36. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  37. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
  39. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
  40. package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
  42. package/src/llama.cpp/examples/infill/infill.cpp +8 -8
  43. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
  44. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
  45. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
  46. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  47. package/src/llama.cpp/examples/llava/clip.h +1 -1
  48. package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
  49. package/src/llama.cpp/examples/llava/llava.cpp +0 -15
  50. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  51. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  52. package/src/llama.cpp/examples/main/main.cpp +29 -17
  53. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  54. package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
  55. package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
  56. package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
  57. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  58. package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
  59. package/src/llama.cpp/examples/server/server.cpp +33 -25
  60. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  61. package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
  62. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
  63. package/src/llama.cpp/ggml-backend.c +2 -3
  64. package/src/llama.cpp/ggml-common.h +0 -54
  65. package/src/llama.cpp/ggml-cuda.h +1 -0
  66. package/src/llama.cpp/ggml-impl.h +51 -0
  67. package/src/llama.cpp/ggml-kompute.cpp +13 -3
  68. package/src/llama.cpp/ggml-opencl.cpp +4 -1
  69. package/src/llama.cpp/ggml-quants.c +3715 -2050
  70. package/src/llama.cpp/ggml-rpc.cpp +1155 -0
  71. package/src/llama.cpp/ggml-rpc.h +24 -0
  72. package/src/llama.cpp/ggml-sycl.cpp +119 -673
  73. package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
  74. package/src/llama.cpp/ggml-vulkan.cpp +203 -224
  75. package/src/llama.cpp/ggml.c +1208 -1483
  76. package/src/llama.cpp/ggml.h +71 -46
  77. package/src/llama.cpp/llama.cpp +1374 -938
  78. package/src/llama.cpp/llama.h +22 -6
  79. package/src/llama.cpp/requirements.txt +0 -2
  80. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
  82. package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
  83. package/src/llama.cpp/tests/test-grad0.cpp +43 -83
  84. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  85. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  86. package/src/llama.cpp/unicode-data.cpp +6969 -2169
  87. package/src/llama.cpp/unicode-data.h +15 -12
  88. package/src/llama.cpp/unicode.cpp +89 -111
  89. package/src/llama.cpp/unicode.h +44 -12
  90. package/src/llama.cpp/build.zig +0 -172
  91. package/src/llama.cpp/ggml-mpi.c +0 -216
  92. package/src/llama.cpp/ggml-mpi.h +0 -39
  93. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
  94. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/llama.h
@@ -81,9 +81,11 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT2       = 7,
         LLAMA_VOCAB_PRE_TYPE_REFACT     = 8,
         LLAMA_VOCAB_PRE_TYPE_COMMAND_R  = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO       = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX       = 12,
+        LLAMA_VOCAB_PRE_TYPE_STABLELM2  = 10,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2      = 11,
+        LLAMA_VOCAB_PRE_TYPE_OLMO       = 12,
+        LLAMA_VOCAB_PRE_TYPE_DBRX       = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG      = 14,
     };

     // note: these values should be synchronized with ggml_rope
@@ -242,6 +244,9 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

+        // comma separated list of RPC servers to use for offloading
+        const char * rpc_servers;
+
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.
@@ -260,6 +265,8 @@ extern "C" {
         bool check_tensors; // validate model tensor data
     };

+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t seed;  // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
@@ -286,14 +293,14 @@ extern "C" {
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -755,6 +762,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);

+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple token).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
@@ -813,6 +826,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
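For orientation, the entry points added above (rpc_servers in llama_model_params, llama_n_threads, llama_n_threads_batch, llama_token_is_control) can be exercised from the C API roughly as sketched below. This is an illustrative sketch, not code shipped in this package: the model path and RPC endpoint are placeholders, error handling is omitted, and rpc_servers only has an effect when llama.cpp is built with RPC support.

// Sketch only: exercises the APIs added in this llama.h revision.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers = "192.168.1.10:50052";   // placeholder endpoint; ignored unless built with RPC

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model == nullptr) return 1;

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    llama_set_n_threads(ctx, 8, 16);
    printf("gen threads: %u, batch threads: %u\n", llama_n_threads(ctx), llama_n_threads_batch(ctx));

    // new helper: distinguish control tokens from renderable tokens
    printf("BOS is control token: %d\n", (int) llama_token_is_control(model, llama_token_bos(model)));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}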
package/src/llama.cpp/requirements.txt
@@ -9,5 +9,3 @@
 -r ./requirements/requirements-convert-hf-to-gguf.txt
 -r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
--r ./requirements/requirements-convert-lora-to-ggml.txt
--r ./requirements/requirements-convert-persimmon-to-gguf.txt
package/src/llama.cpp/tests/CMakeLists.txt
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
 install(TARGETS test-tokenizer-1-bpe RUNTIME)

 # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
package/src/llama.cpp/tests/test-backend-ops.cpp
@@ -2,6 +2,7 @@
 #include <ggml-alloc.h>
 #include <ggml-backend.h>
 #include <ggml-backend-impl.h>
+
 #include <algorithm>
 #include <array>
 #include <cfloat>
@@ -15,6 +16,7 @@
 #include <thread>
 #include <vector>

+
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
     // static RNG initialization (revisit if n_threads stops being constant)
     static const size_t n_threads = std::thread::hardware_concurrency();
@@ -48,6 +50,22 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         t.join();
     }

+#if 0
+    const char * val_str = getenv("GGML_TEST_EPS");
+    float val = 1e-9f;
+    if (val_str != nullptr) {
+        val = std::stof(val_str);
+        printf("GGML_TEST_EPS=%e\n", val);
+    }
+
+    // test quantization with very small values that may result in nan scales due to division by zero
+    if (ggml_is_quantized(tensor->type)) {
+        for (int i = 0; i < 256; i++) {
+            data[i] = val;
+        }
+    }
+#endif
+
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
     } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
@@ -63,6 +81,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }
         ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
+        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -1111,11 +1130,7 @@ struct test_soft_max : public test_case {
         if (this->mask) {
             mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
         }
-        ggml_tensor * pos = nullptr;
-        if (max_bias > 0.0f) {
-            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
-        }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
         return out;
     }
 };
@@ -1127,20 +1142,22 @@ struct test_rope : public test_case {
     int n_dims;
     int mode;
     int n_ctx;
+    bool ff;

     std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
+        return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
     }

     test_rope(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
+            int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
+        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
-        ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
+        ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
+        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
         return out;
     }

@@ -1154,7 +1171,12 @@ struct test_rope : public test_case {
             }
             ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
         } else {
-            init_tensor_uniform(t);
+            if (t->ne[0] == n_dims/2) {
+                // frequency factors in the range [0.9f, 1.1f]
+                init_tensor_uniform(t, 0.9f, 1.1f);
+            } else {
+                init_tensor_uniform(t);
+            }
         }
     }
 }
@@ -1237,22 +1259,26 @@ struct test_im2col : public test_case {
 // GGML_OP_CONCAT
 struct test_concat : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int64_t b_ne2;
+    const std::array<int64_t, 4> ne_a;
+    const int64_t ne_b_d;
+    const int dim;

     std::string vars() override {
-        return VARS_TO_STR3(type, ne, b_ne2);
+        return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
     }

     test_concat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int64_t b_ne2 = 10)
-        : type(type), ne(ne), b_ne2(b_ne2) {}
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
+            int64_t ne_b_d = 10,
+            int dim = 2)
+        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
-        ggml_tensor * out = ggml_concat(ctx, a, b);
+        auto ne_b = ne_a;
+        ne_b[dim] = ne_b_d;
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
+        ggml_tensor * out = ggml_concat(ctx, a, b, dim);
         return out;
     }
 };
@@ -1332,23 +1358,47 @@ struct test_upscale : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const int32_t scale_factor;
+    const bool transpose;

     std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale_factor);
+        return VARS_TO_STR4(type, ne, scale_factor, transpose);
     }

     test_upscale(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2)
-        : type(type), ne(ne), scale_factor(scale_factor) {}
+            int32_t scale_factor = 2, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        if (transpose) a = ggml_transpose(ctx, a);
         ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
         return out;
     }
 };

+// GGML_OP_UPSCALE (ext)
+struct test_upscale_ext : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_tgt;
+
+    std::string vars() override {
+        return VARS_TO_STR3(type, ne, ne_tgt);
+    }
+
+    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {2, 5, 7, 11},
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
+        : type(type), ne(ne), ne_tgt(ne_tgt) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
+        return out;
+    }
+};
+
 // GGML_OP_GROUP_NORM
 struct test_group_norm : public test_case {
     const ggml_type type;
@@ -1490,23 +1540,27 @@ struct test_flash_attn_ext : public test_case {
     const int64_t kv; // kv size
     const int64_t nb; // batch size

+    const bool mask; // use mask
+
+    const float max_bias; // ALiBi
+
     std::string vars() override {
-        return VARS_TO_STR4(hs, nh, kv, nb);
+        return VARS_TO_STR6(hs, nh, kv, nb, mask, max_bias);
     }

     double max_nmse_err() override {
         return 5e-4;
     }

-    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
-        : hs(hs), nh(nh), kv(kv), nb(nb) {}
+    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f)
+        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
         ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
         ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
-        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
-        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        ggml_tensor * m = mask ? ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr;
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias);
         return out;
     }
 };
@@ -1611,7 +1665,7 @@ public:

         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);

-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);

         // split cached v into n_head heads
         struct ggml_tensor * v =
@@ -1720,14 +1774,14 @@ struct test_llama : public test_llm {
         struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur);
         struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur);

-        Qcur = ggml_rope_custom(
-            ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos,
+        Qcur = ggml_rope_ext(
+            ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr,
             hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow
         );

-        Kcur = ggml_rope_custom(
-            ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos,
+        Kcur = ggml_rope_ext(
+            ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr,
             hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale,
             ext_factor, attn_factor, beta_fast, beta_slow
         );
@@ -1846,13 +1900,13 @@ struct test_falcon : public test_llm {
         Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens);

         // using mode = 2 for neox mode
-        Qcur = ggml_rope_custom(
-            ctx, Qcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+        Qcur = ggml_rope_ext(
+            ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
             freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
         );

-        Kcur = ggml_rope_custom(
-            ctx, Kcur, inp_pos, hp.n_rot, 2, 0, hp.n_orig_ctx,
+        Kcur = ggml_rope_ext(
+            ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx,
             freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
         );

@@ -2128,6 +2182,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
 #endif
     for (bool mask : {false, true}) {
         for (float max_bias : {0.0f, 8.0f}) {
+            if (!mask && max_bias > 0.0f) continue;
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
@@ -2141,24 +2196,29 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));

     for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512)); // llama 7B
-        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512)); // llama 13B
-        test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512)); // llama 30B
-        test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512)); // llama 65B
-        test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512)); // neox (falcon 7B)
-        test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512)); // neox (falcon 40B)
-        test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512)); // neox (stablelm)
-        test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512)); // neox (phi-2)
+        // TODO: ff not supported yet for !neox
+        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
+        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
+        test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
+        test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
+
+        for (bool ff : {false, true}) { // freq_factors
+            test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
+            test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, ff)); // neox (falcon 7B)
+            test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
+            test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, ff)); // neox (falcon 40B)
+            test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, ff)); // neox (stablelm)
+            test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, ff)); // neox (phi-2)
+        }
     }

-    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
-    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
+    for (int dim : { 0, 1, 2, 3, }) {
+        test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
+        test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
+    }

     for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
         test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));
@@ -2168,6 +2228,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

     test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_upscale());
+    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
+    test_cases.emplace_back(new test_upscale_ext());
     test_cases.emplace_back(new test_group_norm());
     test_cases.emplace_back(new test_acc());
     test_cases.emplace_back(new test_pad());
@@ -2175,15 +2237,16 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    for (int hs : { 64, 128, }) { // other head sizes not implemented
-#else
     for (int hs : { 64, 80, 128, 256, }) {
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        for (int nh : { 32, }) {
-            for (int kv : { 512, 1024, }) {
-                for (int nb : { 1, 2, 4, 8, }) {
-                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
+        for (bool mask : { true, false } ) {
+            for (float max_bias : { 0.0f, 8.0f }) {
+                if (!mask && max_bias > 0.0f) continue;
+                for (int nh : { 32, }) {
+                    for (int kv : { 512, 1024, }) {
+                        for (int nb : { 1, 2, 4, 8, }) {
+                            test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias));
+                        }
+                    }
                 }
             }
         }
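The test changes above track several ggml API updates: ggml_concat() now takes an explicit concatenation dimension, ggml_soft_max_ext() no longer takes a separate pos tensor (ALiBi is driven by max_bias alone), and ggml_upscale_ext() upscales to an arbitrary target shape. A minimal sketch of the updated signatures, with illustrative shapes and no backend-specific setup (not code from this package):

#include "ggml.h"

int main() {
    // small CPU-side context; the size is arbitrary but large enough for the tensors below
    ggml_init_params params = { /*mem_size*/ 64*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 11, 12, 13, 14);
    ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 11, 12,  7, 14);

    // concat along an explicit dim (here dim 2), matching the new test_concat cases
    ggml_tensor * cat = ggml_concat(ctx, a, b, /*dim*/ 2);

    // soft_max_ext: mask may be null; a max_bias > 0.0f would enable ALiBi
    ggml_tensor * sm = ggml_soft_max_ext(ctx, cat, /*mask*/ nullptr, /*scale*/ 1.0f, /*max_bias*/ 0.0f);

    // upscale to an explicit target shape, as test_upscale_ext does
    ggml_tensor * up = ggml_upscale_ext(ctx, sm, 22, 24, 20, 14);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, up);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 4);

    ggml_free(ctx);
    return 0;
}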
package/src/llama.cpp/tests/test-chat-template.cpp
@@ -49,8 +49,14 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
         // Llama-3
         "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
-        // Phi-3
-        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
+        //Phi-3-mini
+        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-small
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+        //Phi-3-medium
+        "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-vision
+        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -79,8 +85,14 @@ int main(void) {
         "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         // Llama 3
         "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
-        // Phi 3
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-mini
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-small
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-medium
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-vision
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
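The template strings added above are exercised through llama_chat_apply_template(), which can also be called directly with a template string and no model. A rough sketch of that usage, mirroring the test (the conversation content here is made up for illustration):

#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    // Phi-3-small template string from the test above
    const char * tmpl =
        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}"
        "{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}";

    llama_chat_message chat[] = {
        { "system",    "You are a helpful assistant" },
        { "user",      "Hello" },
        { "assistant", "Hi there" },
    };

    std::vector<char> buf(1024);
    // model == nullptr: format using the given template string instead of the model's built-in one
    int32_t res = llama_chat_apply_template(nullptr, tmpl, chat, 3, /*add_ass*/ true, buf.data(), (int32_t) buf.size());
    if (res > (int32_t) buf.size()) {
        // buffer too small: resize to res and call again, as the test does
    }
    if (res > 0) printf("%.*s\n", res, buf.data());
    return 0;
}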
package/src/llama.cpp/tests/test-grad0.cpp
@@ -1515,90 +1515,50 @@ int main(int argc, const char ** argv) {
         }

         // flash_attn f32
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int max_nrep = (ndims >= 3) ? 2 : 1;
-                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
-                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
-                        int64_t nek[4] = { D, M, B, ne[3] };
-                        int64_t nev[4] = { M, D, B, ne[3] };
-                        if (ndims == 2) {
-                            neq[2] = 1; neq[3] = 1;
-                            nek[2] = 1; nek[3] = 1;
-                            nev[2] = 1; nev[3] = 1;
-                        } else if (ndims == 3) {
-                            neq[3] = 1;
-                            nek[3] = 1;
-                            nev[3] = 1;
-                        }
-                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-                        ggml_set_param(ctx0, x[2]);
-
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
-
-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+        // TODO: adapt to ggml_flash_attn_ext() changes
+        //{
+        //    srand(seed);
+        //    const int nargs = 3;
+
+        //    int64_t ne2[4];
+
+        //    get_random_dims(ne2, 4);
+        //    int64_t D = ne2[0];
+        //    int64_t N = ne2[1];
+        //    int64_t M = ne2[2] + N;
+        //    int64_t B = ne2[3];
+
+        //    for (int masked = 0; masked <= 1; ++masked) {
+        //        for (int ndims = 2; ndims <= 4; ++ndims) {
+        //            int max_nrep = (ndims >= 3) ? 2 : 1;
+        //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
+        //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
+        //                int64_t nek[4] = { D, M, B, ne[3] };
+        //                int64_t nev[4] = { M, D, B, ne[3] };
+        //                if (ndims == 2) {
+        //                    neq[2] = 1; neq[3] = 1;
+        //                    nek[2] = 1; nek[3] = 1;
+        //                    nev[2] = 1; nev[3] = 1;
+        //                } else if (ndims == 3) {
+        //                    neq[3] = 1;
+        //                    nek[3] = 1;
+        //                    nev[3] = 1;
+        //                }
+        //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+        //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+        //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+        //                ggml_set_param(ctx0, x[0]);
+        //                ggml_set_param(ctx0, x[1]);
+        //                ggml_set_param(ctx0, x[2]);
+
+        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+
+        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
+        //            }
+        //        }
+        //    }
+        //}

-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                }
-            }
-        }
         ggml_free(ctx0);
     }
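The gradient test disabled above is waiting on the ggml_flash_attn_ext() interface, whose updated signature (optional mask plus an ALiBi max_bias parameter) is what test-backend-ops.cpp now covers. A minimal sketch of building such a node, using the same tensor layout as test_flash_attn_ext; this is illustrative only, not code from this package:

#include "ggml.h"
#include <cmath>

int main() {
    ggml_init_params params = { /*mem_size*/ 256*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ false };
    ggml_context * ctx = ggml_init(params);

    const int64_t hs = 128, nh = 32, kv = 512, nb = 8;  // head size, heads, kv length, batch

    ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
    ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
    ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
    ggml_tensor * m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);

    // scale = 1/sqrt(head size); max_bias = 0.0f disables ALiBi, > 0.0f enables it
    ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f / std::sqrt((float) hs), /*max_bias*/ 0.0f);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    ggml_free(ctx);
    return 0;
}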