@fugood/llama.node 0.2.2 → 0.3.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (320)
  1. package/CMakeLists.txt +5 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +8 -1
  17. package/package.json +1 -1
  18. package/patches/llama.patch +12 -12
  19. package/src/DetokenizeWorker.cpp +1 -1
  20. package/src/LlamaContext.cpp +33 -1
  21. package/src/LlamaContext.h +1 -0
  22. package/src/LoadSessionWorker.cpp +1 -0
  23. package/src/llama.cpp/.github/workflows/bench.yml +310 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +1315 -0
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +23 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +116 -0
  27. package/src/llama.cpp/.github/workflows/editorconfig.yml +27 -0
  28. package/src/llama.cpp/.github/workflows/gguf-publish.yml +44 -0
  29. package/src/llama.cpp/.github/workflows/labeler.yml +17 -0
  30. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +65 -0
  31. package/src/llama.cpp/.github/workflows/nix-ci.yml +72 -0
  32. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +22 -0
  33. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +36 -0
  34. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +35 -0
  35. package/src/llama.cpp/.github/workflows/python-lint.yml +23 -0
  36. package/src/llama.cpp/.github/workflows/python-type-check.yml +38 -0
  37. package/src/llama.cpp/.github/workflows/server.yml +183 -0
  38. package/src/llama.cpp/CMakeLists.txt +91 -1245
  39. package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +1 -1
  40. package/src/llama.cpp/cmake/build-info.cmake +58 -0
  41. package/src/llama.cpp/cmake/git-vars.cmake +22 -0
  42. package/src/llama.cpp/common/CMakeLists.txt +4 -3
  43. package/src/llama.cpp/common/build-info.cpp.in +4 -0
  44. package/src/llama.cpp/common/common.cpp +1116 -877
  45. package/src/llama.cpp/common/common.h +191 -77
  46. package/src/llama.cpp/common/grammar-parser.cpp +118 -31
  47. package/src/llama.cpp/common/json-schema-to-grammar.cpp +346 -65
  48. package/src/llama.cpp/common/log.h +1 -1
  49. package/src/llama.cpp/common/ngram-cache.h +10 -3
  50. package/src/llama.cpp/common/sampling.cpp +19 -10
  51. package/src/llama.cpp/docs/build.md +353 -0
  52. package/src/llama.cpp/examples/CMakeLists.txt +22 -22
  53. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +6 -6
  55. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/batched/batched.cpp +52 -55
  57. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +20 -72
  59. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/chat-13B.bat +57 -0
  61. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/{finetune → cvector-generator}/CMakeLists.txt +2 -2
  63. package/src/llama.cpp/examples/cvector-generator/completions.txt +582 -0
  64. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +503 -0
  65. package/src/llama.cpp/examples/cvector-generator/mean.hpp +48 -0
  66. package/src/llama.cpp/examples/cvector-generator/negative.txt +4 -0
  67. package/src/llama.cpp/examples/cvector-generator/pca.hpp +325 -0
  68. package/src/llama.cpp/examples/cvector-generator/positive.txt +4 -0
  69. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +35 -0
  70. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  71. package/src/llama.cpp/examples/embedding/embedding.cpp +94 -46
  72. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +2 -2
  73. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +4 -6
  74. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/export-lora/export-lora.cpp +344 -386
  76. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +2 -2
  77. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +30 -25
  78. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/gguf/gguf.cpp +5 -0
  80. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +15 -0
  81. package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +46 -0
  82. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +295 -0
  83. package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +52 -0
  84. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +221 -0
  85. package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +24 -0
  86. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +42 -0
  87. package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +7093 -0
  88. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +693 -0
  89. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  90. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +3 -3
  91. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  92. package/src/llama.cpp/examples/gritlm/gritlm.cpp +6 -2
  93. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  94. package/src/llama.cpp/examples/imatrix/imatrix.cpp +137 -176
  95. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/infill/infill.cpp +38 -153
  97. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +175 -94
  98. package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +65 -0
  99. package/src/llama.cpp/examples/llama.android/build.gradle.kts +6 -0
  100. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +68 -0
  101. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +11 -7
  102. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +2 -2
  103. package/src/llama.cpp/examples/llama.android/settings.gradle.kts +18 -0
  104. package/src/llama.cpp/examples/llava/CMakeLists.txt +6 -5
  105. package/src/llama.cpp/examples/llava/android/build_64.sh +8 -0
  106. package/src/llama.cpp/examples/llava/clip.cpp +23 -14
  107. package/src/llama.cpp/examples/llava/llava-cli.cpp +8 -6
  108. package/src/llama.cpp/examples/llava/requirements.txt +3 -2
  109. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  110. package/src/llama.cpp/examples/lookahead/lookahead.cpp +2 -1
  111. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  112. package/src/llama.cpp/examples/lookup/lookup-create.cpp +2 -0
  113. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  114. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -2
  115. package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
  116. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  117. package/src/llama.cpp/examples/main/main.cpp +98 -75
  118. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +4 -5
  119. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  120. package/src/llama.cpp/examples/parallel/parallel.cpp +2 -1
  121. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  122. package/src/llama.cpp/examples/passkey/passkey.cpp +23 -43
  123. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  124. package/src/llama.cpp/examples/perplexity/perplexity.cpp +13 -10
  125. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  126. package/src/llama.cpp/examples/quantize/quantize.cpp +37 -34
  127. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  129. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  130. package/src/llama.cpp/examples/retrieval/retrieval.cpp +26 -77
  131. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +14 -7
  133. package/src/llama.cpp/examples/server/CMakeLists.txt +26 -2
  134. package/src/llama.cpp/examples/server/server.cpp +274 -671
  135. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  136. package/src/llama.cpp/examples/server/utils.hpp +28 -29
  137. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  138. package/src/llama.cpp/examples/simple/simple.cpp +21 -29
  139. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  140. package/src/llama.cpp/examples/speculative/speculative.cpp +2 -1
  141. package/src/llama.cpp/examples/sycl/CMakeLists.txt +1 -1
  142. package/src/llama.cpp/examples/sycl/build.sh +23 -0
  143. package/src/llama.cpp/examples/sycl/run-llama2.sh +36 -0
  144. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +33 -0
  145. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +9 -0
  146. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  147. package/src/llama.cpp/examples/tokenize/tokenize.cpp +16 -2
  148. package/src/llama.cpp/ggml/CMakeLists.txt +253 -0
  149. package/src/llama.cpp/{cmake → ggml/cmake}/FindSIMD.cmake +6 -6
  150. package/src/llama.cpp/{ggml-backend.h → ggml/include/ggml-backend.h} +22 -17
  151. package/src/llama.cpp/ggml/include/ggml-blas.h +23 -0
  152. package/src/llama.cpp/ggml/include/ggml-cann.h +125 -0
  153. package/src/llama.cpp/{ggml-cuda.h → ggml/include/ggml-cuda.h} +3 -0
  154. package/src/llama.cpp/{ggml-metal.h → ggml/include/ggml-metal.h} +1 -2
  155. package/src/llama.cpp/{ggml-sycl.h → ggml/include/ggml-sycl.h} +3 -10
  156. package/src/llama.cpp/{ggml.h → ggml/include/ggml.h} +80 -85
  157. package/src/llama.cpp/ggml/src/CMakeLists.txt +1329 -0
  158. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2193 -0
  159. package/src/llama.cpp/ggml/src/ggml-aarch64.h +39 -0
  160. package/src/llama.cpp/{ggml-alloc.c → ggml/src/ggml-alloc.c} +100 -49
  161. package/src/llama.cpp/{ggml-backend-impl.h → ggml/src/ggml-backend-impl.h} +20 -8
  162. package/src/llama.cpp/{ggml-backend.c → ggml/src/ggml-backend.c} +307 -167
  163. package/src/llama.cpp/ggml/src/ggml-blas.cpp +367 -0
  164. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +198 -0
  165. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +230 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +2944 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/common.h +282 -0
  169. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +32 -0
  170. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +17 -0
  171. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +223 -0
  172. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +186 -0
  173. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +180 -0
  174. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +193 -0
  175. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  176. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +208 -0
  177. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +206 -0
  178. package/src/llama.cpp/ggml/src/ggml-cann.cpp +2023 -0
  179. package/src/llama.cpp/{ggml-common.h → ggml/src/ggml-common.h} +41 -7
  180. package/src/llama.cpp/{ggml-impl.h → ggml/src/ggml-impl.h} +113 -9
  181. package/src/llama.cpp/{ggml-kompute.cpp → ggml/src/ggml-kompute.cpp} +33 -18
  182. package/src/llama.cpp/{ggml-quants.c → ggml/src/ggml-quants.c} +1460 -940
  183. package/src/llama.cpp/{ggml-quants.h → ggml/src/ggml-quants.h} +19 -20
  184. package/src/llama.cpp/{ggml-rpc.cpp → ggml/src/ggml-rpc.cpp} +95 -72
  185. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +27 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +53 -0
  187. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +355 -0
  188. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +195 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +21 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +547 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +27 -0
  192. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +698 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +27 -0
  195. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +3011 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  197. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +33 -0
  198. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1027 -0
  199. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +27 -0
  200. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +374 -0
  201. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +35 -0
  202. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +66 -0
  203. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +275 -0
  204. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +22 -0
  205. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +251 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +24 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +1140 -0
  208. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +5314 -0
  209. package/src/llama.cpp/{ggml-vulkan.cpp → ggml/src/ggml-vulkan.cpp} +1781 -1868
  210. package/src/llama.cpp/{ggml.c → ggml/src/ggml.c} +1245 -2087
  211. package/src/llama.cpp/{sgemm.cpp → ggml/src/llamafile/sgemm.cpp} +21 -24
  212. package/src/llama.cpp/{sgemm.h → ggml/src/llamafile/sgemm.h} +1 -1
  213. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +5 -0
  214. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +552 -0
  215. package/src/llama.cpp/{llama.h → include/llama.h} +175 -100
  216. package/src/llama.cpp/models/.editorconfig +1 -0
  217. package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
  218. package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
  219. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
  220. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +112 -0
  221. package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +46 -0
  222. package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
  223. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +112 -0
  224. package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +46 -0
  225. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
  226. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +112 -0
  227. package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +46 -0
  228. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
  229. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +112 -0
  230. package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +46 -0
  231. package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
  232. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +112 -0
  233. package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +46 -0
  234. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
  235. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +112 -0
  236. package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +46 -0
  237. package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
  238. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
  239. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
  242. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +112 -0
  243. package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +46 -0
  244. package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
  245. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +112 -0
  246. package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +46 -0
  247. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
  248. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +112 -0
  249. package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +46 -0
  250. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
  251. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +112 -0
  252. package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +46 -0
  253. package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
  254. package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +112 -0
  255. package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +46 -0
  256. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
  257. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +112 -0
  258. package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +46 -0
  259. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  260. package/src/llama.cpp/requirements/requirements-all.txt +12 -0
  261. package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +2 -0
  262. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -0
  263. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +3 -0
  264. package/src/llama.cpp/requirements/{requirements-convert.txt → requirements-convert_legacy_llama.txt} +1 -1
  265. package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +1 -0
  266. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  267. package/src/llama.cpp/requirements/requirements-pydantic.txt +3 -0
  268. package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +1 -0
  269. package/src/llama.cpp/requirements.txt +5 -4
  270. package/src/llama.cpp/scripts/build-info.sh +30 -0
  271. package/src/llama.cpp/scripts/install-oneapi.bat +19 -0
  272. package/src/llama.cpp/src/CMakeLists.txt +33 -0
  273. package/src/llama.cpp/src/llama-grammar.cpp +539 -0
  274. package/src/llama.cpp/src/llama-grammar.h +39 -0
  275. package/src/llama.cpp/src/llama-impl.h +26 -0
  276. package/src/llama.cpp/src/llama-sampling.cpp +635 -0
  277. package/src/llama.cpp/src/llama-sampling.h +56 -0
  278. package/src/llama.cpp/src/llama-vocab.cpp +1721 -0
  279. package/src/llama.cpp/src/llama-vocab.h +130 -0
  280. package/src/llama.cpp/{llama.cpp → src/llama.cpp} +5979 -5260
  281. package/src/llama.cpp/{unicode-data.cpp → src/unicode-data.cpp} +851 -802
  282. package/src/llama.cpp/{unicode.cpp → src/unicode.cpp} +52 -30
  283. package/src/llama.cpp/{unicode.h → src/unicode.h} +5 -1
  284. package/src/llama.cpp/tests/CMakeLists.txt +19 -20
  285. package/src/llama.cpp/tests/test-backend-ops.cpp +245 -67
  286. package/src/llama.cpp/tests/test-chat-template.cpp +57 -3
  287. package/src/llama.cpp/tests/test-double-float.cpp +2 -2
  288. package/src/llama.cpp/tests/test-grad0.cpp +2 -2
  289. package/src/llama.cpp/tests/test-grammar-integration.cpp +978 -31
  290. package/src/llama.cpp/tests/test-grammar-parser.cpp +423 -158
  291. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +508 -135
  292. package/src/llama.cpp/tests/test-llama-grammar.cpp +15 -9
  293. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -1
  294. package/src/llama.cpp/tests/test-quantize-perf.cpp +1 -1
  295. package/src/llama.cpp/tests/test-rope.cpp +3 -4
  296. package/src/llama.cpp/tests/test-sampling.cpp +5 -5
  297. package/src/llama.cpp/tests/test-tokenizer-0.cpp +6 -6
  298. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +20 -15
  299. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +22 -11
  300. package/bin/darwin/arm64/default.metallib +0 -0
  301. package/bin/darwin/x64/default.metallib +0 -0
  302. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +0 -5
  303. package/src/llama.cpp/examples/beam-search/beam-search.cpp +0 -188
  304. package/src/llama.cpp/examples/finetune/finetune.cpp +0 -1862
  305. package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +0 -55
  306. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -5
  307. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +0 -1253
  308. package/src/llama.cpp/ggml-opencl.cpp +0 -2305
  309. package/src/llama.cpp/ggml-opencl.h +0 -36
  310. package/src/llama.cpp/ggml-sycl.cpp +0 -17340
  311. package/src/llama.cpp/ggml-vulkan-shaders.hpp +0 -81211
  312. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +0 -2
  313. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -2
  314. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +0 -1
  315. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +0 -24
  316. /package/src/llama.cpp/{ggml-alloc.h → ggml/include/ggml-alloc.h} +0 -0
  317. /package/src/llama.cpp/{ggml-kompute.h → ggml/include/ggml-kompute.h} +0 -0
  318. /package/src/llama.cpp/{ggml-rpc.h → ggml/include/ggml-rpc.h} +0 -0
  319. /package/src/llama.cpp/{ggml-vulkan.h → ggml/include/ggml-vulkan.h} +0 -0
  320. /package/src/llama.cpp/{unicode-data.h → src/unicode-data.h} +0 -0
@@ -2,10 +2,12 @@
 #undef NDEBUG
 #endif
 
-#include "llama.cpp" // TODO: not great
+#define LLAMA_API_INTERNAL
+#include "llama.h"
 #include "grammar-parser.h"
 
 #include <cassert>
+#include <stdexcept>
 
 int main()
 {
@@ -112,10 +114,14 @@ int main()
         }
     }
 
-    llama_grammar *grammar = NULL;
+    llama_grammar * grammar = NULL;
     std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-    grammar = llama_grammar_init(
-        grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+
+    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    if (grammar == nullptr)
+    {
+        throw std::runtime_error("Failed to initialize llama_grammar");
+    }
 
     std::vector<std::vector<llama_grammar_element>> expected_stacks = {
         {
@@ -168,7 +174,7 @@ int main()
     }};
 
     auto index = 0;
-    for (auto stack : grammar->stacks)
+    for (auto stack : llama_grammar_get_stacks(grammar))
     {
         // compare stack to expected_stack
         for (uint32_t i = 0; i < stack.size(); i++)
@@ -370,13 +376,13 @@ int main()
         },
     };
 
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[0], next_candidates);
+    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
 
     std::vector<std::vector<llama_grammar_candidate>> all_rejects;
 
-    for (std::size_t count = 0; count < grammar->stacks.size(); ++count)
+    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
     {
-        rejects = llama_grammar_reject_candidates_for_stack(grammar->rules, grammar->stacks[count], next_candidates);
+        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
         all_rejects.push_back(rejects);
     }
 
@@ -397,6 +403,6 @@ int main()
         delete[] candidate.code_points;
         candidate.code_points = nullptr;
     }
-    delete grammar;
+    llama_grammar_free(grammar);
     return 0;
 }
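The hunks above match tests/test-llama-grammar.cpp in the file list: the test no longer compiles llama.cpp into itself or reaches into llama_grammar internals. Rules and stacks are read through accessor functions, llama_grammar_init can now return nullptr on failure, and grammars are released with llama_grammar_free instead of delete. A minimal sketch of the updated pattern under those assumptions, also assuming the grammar_parser::parse() helper from common/grammar-parser.h (the grammar string and variable names below are illustrative, not from the package):

    // Sketch: create, inspect and free a grammar via the accessor-based internal API.
    #define LLAMA_API_INTERNAL
    #include "llama.h"
    #include "grammar-parser.h"

    #include <vector>

    int main() {
        grammar_parser::parse_state parsed = grammar_parser::parse("root ::= \"yes\" | \"no\"");
        std::vector<const llama_grammar_element *> rules(parsed.c_rules());

        // llama_grammar_init may now fail and return nullptr instead of asserting.
        llama_grammar * grammar = llama_grammar_init(rules.data(), rules.size(), parsed.symbol_ids.at("root"));
        if (grammar == nullptr) {
            return 1;
        }

        // Internals are reached through accessors rather than grammar->rules / grammar->stacks.
        const auto & grammar_rules  = llama_grammar_get_rules(grammar);
        const auto & grammar_stacks = llama_grammar_get_stacks(grammar);
        (void) grammar_rules; (void) grammar_stacks;

        llama_grammar_free(grammar); // replaces `delete grammar`
        return 0;
    }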
@@ -60,7 +60,7 @@ static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test
     qfns.from_float(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
 
-    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+    qfns.from_float_ref(test_data, tmp_q.data(), test_size);
     qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
 
     return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
@@ -285,7 +285,7 @@ int main(int argc, char * argv[]) {
         for (size_t size : params.test_sizes) {
             printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
             auto quantize_fn = [&](void) -> float {
-                qfns.from_float_reference(test_data1, test_q1, size);
+                qfns.from_float_ref(test_data1, test_q1, size);
                 return test_q1[0];
            };
            size_t quantized_size = ggml_row_size(type, size);
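The two hunks above (test-quantize-fns.cpp and test-quantize-perf.cpp, per the file list) only track a rename in ggml's type traits: the scalar reference quantizer is now from_float_ref rather than from_float_reference. A small sketch of the round trip those tests exercise, assuming the ggml_internal_get_type_traits() accessor they rely on and a type whose block size divides 256:

    // Sketch: quantize and dequantize one row with the renamed reference quantizer.
    #include "ggml.h"

    #include <cstdint>
    #include <vector>

    static void roundtrip_q8_0_reference() {
        ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q8_0);

        std::vector<float>   src(256, 0.5f), dst(256);
        std::vector<uint8_t> q(ggml_row_size(GGML_TYPE_Q8_0, src.size()));

        qfns.from_float_ref(src.data(), q.data(), src.size()); // was qfns.from_float_reference(...)
        qfns.to_float(q.data(), dst.data(), dst.size());
    }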
@@ -162,12 +162,12 @@ int main(int /*argc*/, const char ** /*argv*/) {
         x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
 
         // 100, 101, 102, ..., 172
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024);
+        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
         // -67, -67, -67, ..., -67
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
+        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. forget n_past_0 - n_past_2 tokens
 
         // 33, 34, 35, ..., 105
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024);
+        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);
 
         ggml_cgraph * gf = ggml_new_graph(ctx0);
 
@@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) {
 
     return 0;
 }
-
@@ -166,12 +166,12 @@ static void test_sampler_queue(
     for (auto s : samplers_sequence) {
        switch (s){
            case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-           case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
-           case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
+           case 'f': GGML_ABORT("tail_free test not implemented"); break;
+           case 'y': GGML_ABORT("typical test not implemented"); break;
            case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
            case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-           case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-           default : GGML_ASSERT(false && "Unknown sampler"); break;
+           case 't': GGML_ABORT("temperature test not implemented"); break;
+           default : GGML_ABORT("Unknown sampler"); break;
        }
 
        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
        GGML_ASSERT(candidates_p.data[0].id == max_token_id);
        GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
    } else {
-       GGML_ABORT: GGML_ASSERT(false);
+       GGML_ABORT("fatal error");
    }
 }
 
@@ -195,11 +195,11 @@ int main(int argc, char **argv) {
     const bool add_special = false;
 
     for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
 
         printf("\n");
         printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
+        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
         printf("tok: ");
         for (const auto & tok : res) {
             printf("%d ", tok);
@@ -216,8 +216,8 @@ int main(int argc, char **argv) {
         if (!correct) {
             fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
             fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
+                llama_detokenize(ctx, res).c_str(),
+                llama_detokenize(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
@@ -253,7 +253,7 @@ int main(int argc, char **argv) {
        {
            const auto t_start = ggml_time_us();
 
-           res = llama_tokenize(ctx, text, add_special);
+           res = llama_tokenize(ctx, text, add_special, false);
 
            const auto t_end = ggml_time_us();
 
@@ -272,7 +272,7 @@ int main(int argc, char **argv) {
        }
 
        for (const auto & tok : res) {
-           //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+           //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
            ofs << tok << "\n";
        }
    }
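The tokenizer-test hunks above and below follow two changes in the common helpers: the llama_tokenize() wrapper now takes an explicit parse_special flag after add_special, and the vocab-specific llama_detokenize_bpe()/llama_detokenize_spm() helpers are folded into a single llama_detokenize(). A hedged sketch of the new calls, assuming those common.h signatures and an already-created llama_context:

    // Sketch: round-trip text through the updated common tokenize/detokenize helpers.
    #include "common.h"

    static std::string roundtrip(llama_context * ctx, const std::string & text) {
        // add_special = false, parse_special = false (the new explicit flag).
        std::vector<llama_token> tokens = llama_tokenize(ctx, text, false, false);

        // One detokenizer for every vocab type; replaces llama_detokenize_bpe()/llama_detokenize_spm().
        return llama_detokenize(ctx, tokens);
    }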
@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char **argv) {
     if (argc < 2 || argc > 3) {
@@ -63,7 +64,10 @@ int main(int argc, char **argv) {
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -74,7 +78,7 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
+        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
         try {
             auto cps = unicode_cpts_from_utf8(str);
             std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
@@ -90,7 +94,7 @@ int main(int argc, char **argv) {
             fprintf(stderr, "]\n");
             return 2;
         }
-        std::string check = llama_detokenize_bpe(ctx, tokens);
+        std::string check = llama_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -108,26 +112,23 @@ int main(int argc, char **argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (!( // NOLINT
-                        (cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
-                        (cp < 0x13 || cp > 0x17) && cp != 0x19 &&
-                        (cp < 0x1c || cp > 0x1e) &&
-                        (cp < 0xd800 || cp > 0xdfff) &&
-                        (cp < 0x00040000 || cp >= 0x000e0000)
-                    )) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                         continue;
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
                     std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    std::string check = llama_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                             cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
             });
@@ -136,6 +137,10 @@ int main(int argc, char **argv) {
         for (auto & t : threads) {
             t.join();
         }
+
+        if (errcode) {
+            return errcode;
+        }
     }
 
     llama_free_model(model);
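Both threaded tokenizer tests (the BPE hunk above and the SPM hunk below) also change how failures escape the worker threads: instead of calling std::exit(3) inside a lambda, each thread sets a shared std::atomic_int and stops early, and the test returns the code after joining. A small self-contained sketch of that pattern (the per-codepoint check itself is elided):

    // Sketch: propagate a failure out of worker threads without std::exit().
    #include <atomic>
    #include <cstdint>
    #include <thread>
    #include <vector>

    static int run_checks(int nthread) {
        std::atomic_int errcode{0};

        std::vector<std::thread> threads(nthread);
        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, &errcode]() {
                // Stop looping as soon as any thread has reported an error.
                for (uint32_t cp = (uint32_t) i; !errcode && cp < 0x00110000; cp += (uint32_t) nthread) {
                    bool ok = true; // placeholder for the real tokenize/detokenize round-trip check
                    if (!ok) {
                        errcode = 3; // remembered instead of calling std::exit(3)
                    }
                }
            });
        }
        for (auto & t : threads) {
            t.join();
        }
        return errcode; // 0 on success, non-zero if any thread flagged a failure
    }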
@@ -11,6 +11,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <atomic>
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
@@ -51,7 +52,10 @@ int main(int argc, char ** argv) {
         }
     }
 
-    GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
+        return 99;
+    }
 
 #ifdef _WIN32
     // We need this for unicode console support
@@ -62,9 +66,9 @@ int main(int argc, char ** argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
+        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+        std::string check = llama_detokenize(ctx, tokens);
         if (check != str) {
             fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                 __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
@@ -78,20 +82,23 @@ int main(int argc, char ** argv) {
 
         std::vector<std::thread> threads(nthread);
 
+        std::atomic_int errcode = {};
+
         for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx]() {
-                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
-                    if (cp >= 0xd800 && cp <= 0xdfff) {
+            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
+                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
+                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
+                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                         continue;
                     }
 
                     std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+                    std::string check = llama_detokenize(ctx, tokens);
                     if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                             cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        std::exit(3);
+                        errcode = 3;
                     }
                 }
            });
@@ -100,6 +107,10 @@ int main(int argc, char ** argv) {
        for (auto & t : threads) {
            t.join();
        }
+
+       if(errcode) {
+           return errcode;
+       }
    }
 
    llama_free_model(model);
Binary files changed (contents not shown)
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
-    llama_context * ctx;
-    llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
-    os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
-    for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
-        os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
-    }
-    return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
-    llama_context * ctx;
-    std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
-    return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-//  * Show progress by printing ',' following by number of convergent beam tokens if any.
-//  * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-//    This is also called when the stop condition is met.
-//    Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
-    auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
-    // Mark beams as EOS as needed.
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        llama_beam_view& beam_view = beams_state.beam_views[i];
-        if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
-            beam_view.eob = true;
-        }
-    }
-    printf(","); // Show progress
-    if (const size_t n = beams_state.common_prefix_length) {
-        callback_data.response.resize(callback_data.response.size() + n);
-        assert(0u < beams_state.n_beams);
-        const llama_token * tokens = beams_state.beam_views[0].tokens;
-        std::copy(tokens, tokens + n, callback_data.response.end() - n);
-        printf("%zu", n);
-    }
-    fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
-    std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
-    for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
-        std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
-    }
-#endif
-}
-
-int main(int argc, char ** argv)
-{
-    gpt_params params;
-    //params.n_gpu_layers = 200;
-
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc < 2 || argv[1][0] == '-' )
-    {
-        printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
-        return 1 ;
-    }
-
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    params.model = argv[1];
-
-    params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
-
-    if ( argc > 3 )
-    {
-        params.prompt = argv[3];
-    }
-
-    if ( params.prompt.empty() )
-    {
-        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
-    }
-
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
-        return 1;
-    }
-
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
-
-    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
-    const size_t max_context_size = llama_n_ctx( ctx );
-    const size_t max_tokens_list_size = max_context_size - 4 ;
-
-    if (tokens_list.size() > max_tokens_list_size)
-    {
-        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
-             __func__ , tokens_list.size() , max_tokens_list_size );
-        return 1;
-    }
-
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        std::cout << llama_token_to_piece(ctx, id);
-    }
-    std::cout << std::flush;
-
-    int n_past = 0;
-
-    if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
-    {
-        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
-        return 1;
-    }
-    n_past += tokens_list.size();
-
-    beam_search_callback_data callback_data{ctx, {}};
-    size_t const beam_width = static_cast<size_t>(params.n_beams);
-    int const n_predict = 256;
-    llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
-    std::cout << "\n\n";
-    for (llama_token const token_id : callback_data.response) {
-        std::cout << llama_token_to_piece(ctx,token_id);
-    }
-    std::cout << std::endl;
-
-    llama_free( ctx );
-    llama_free_model( model );
-
-    llama_backend_free();
-
-    return 0;
-}