@fugood/llama.node 1.0.6 → 1.1.0

@@ -105,7 +105,7 @@ llama_context::llama_context(
 
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -508,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
 }
 
 float * llama_context::get_logits() {
+    output_reorder();
+
     return logits;
 }
 
 float * llama_context::get_logits_ith(int32_t i) {
     int64_t j = -1;
 
+    output_reorder();
+
     try {
         if (logits == nullptr) {
             throw std::runtime_error("no logits");
@@ -550,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
 }
 
 float * llama_context::get_embeddings() {
+    output_reorder();
+
     return embd;
 }
 
 float * llama_context::get_embeddings_ith(int32_t i) {
     int64_t j = -1;
 
+    output_reorder();
+
     try {
         if (embd == nullptr) {
             throw std::runtime_error("no embeddings");
@@ -891,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -970,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
 
     // TODO: this clear of the buffer can easily be forgotten - need something better
     embd_seq.clear();
+    output_swaps.clear();
 
     bool did_optimize = false;
 
@@ -1189,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // make the outputs have the same order they had in the user-provided batch
     // note: this is mostly relevant for recurrent models atm
     if (!sorted_output) {
-        const uint32_t n_vocab = model.vocab.n_tokens();
-        const uint64_t n_embd = model.hparams.n_embd;
-
         GGML_ASSERT((size_t) n_outputs == out_ids.size());
 
         // TODO: is there something more efficient which also minimizes swaps?
@@ -1207,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
                 continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
-            if (logits_size > 0) {
-                for (uint32_t k = 0; k < n_vocab; k++) {
-                    std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
-                }
-            }
-            if (embd_size > 0) {
-                for (uint32_t k = 0; k < n_embd; k++) {
-                    std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
-                }
-            }
+
+            // remember the swaps and apply them lazily upon logits/embeddings access
+            output_swaps.push_back({ i, j_min });
        }
 
        std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1230,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
 
@@ -1307,6 +1318,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }
 
+void llama_context::output_reorder() {
+    const uint32_t n_vocab = model.vocab.n_tokens();
+    const uint64_t n_embd = model.hparams.n_embd;
+
+    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
+        const uint32_t i0 = output_swaps[s].i0;
+        const uint32_t i1 = output_swaps[s].i1;
+
+        if (logits_size > 0) {
+            for (uint32_t k = 0; k < n_vocab; k++) {
+                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
+            }
+        }
+
+        if (embd_size > 0) {
+            for (uint32_t k = 0; k < n_embd; k++) {
+                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
+            }
+        }
+    }
+
+    output_swaps.clear();
+}
+
 //
 // graph
 //
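
Taken together, the llama_context hunks above switch output reordering from eager to lazy: decode() now only records which output rows were swapped back into user order (output_swaps), and the swaps are applied on first access via output_reorder(), which the get_logits*/get_embeddings* accessors call. A self-contained sketch of that record-then-apply-lazily pattern follows; swap_info and apply_pending_swaps here are illustrative names, not the library API:

    // Sketch: record row swaps during decoding, apply them once on first access.
    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    struct swap_info { uint32_t i0, i1; };

    // rows is an n_rows x row_width buffer (e.g. logits); swaps were recorded while sorting outputs
    static void apply_pending_swaps(std::vector<float> & rows, size_t row_width,
                                    std::vector<swap_info> & swaps) {
        for (const auto & s : swaps) {
            for (size_t k = 0; k < row_width; ++k) {
                std::swap(rows[s.i0*row_width + k], rows[s.i1*row_width + k]);
            }
        }
        swaps.clear(); // applied exactly once; later accesses see already-reordered rows
    }

The deferral moves the per-row copying (on the order of n_outputs x n_vocab elements for logits) out of decode() and into the first accessor call, and clearing the swap list keeps repeated get_logits()/get_embeddings() calls idempotent.
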
@@ -181,6 +181,8 @@ private:
     // Returns max number of outputs for which space was reserved.
     uint32_t output_reserve(int32_t n_outputs);
 
+    void output_reorder();
+
     //
     // graph
     //
@@ -250,6 +252,13 @@ private:
 
     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
 
+    struct swap_info {
+        uint32_t i0;
+        uint32_t i1;
+    };
+
+    std::vector<swap_info> output_swaps;
+
     ggml_backend_sched_ptr sched;
 
     ggml_backend_t backend_cpu = nullptr;
@@ -278,6 +287,10 @@ private:
 
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
@@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
     // Iterate and write all the keys first, each row is a cell
     // Get whole range at a time
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+        if (r_l[il] == nullptr) continue;
 
         // Write key type
         const int32_t r_type_i = (int32_t)r_l[il]->type;
@@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 
     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;
 
             // Write value type
             const int32_t s_type_i = (int32_t)s_l[il]->type;
@@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
         // When v is transposed, we also need the element size and get the element ranges from each row
        const uint32_t mem_size = size;
        for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
+            if (s_l[il] == nullptr) continue;
+
            const uint32_t n_embd_s = hparams.n_embd_s();
 
            // Write value type
@@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
 
     // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
     for (uint32_t il = 0; il < n_layer; ++il) {
+        // skip null layers
+        if (r_l[il] == nullptr) continue;
 
         // Read type of key
         int32_t r_type_i_ref;
@@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
 
     if (!s_trans) {
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;
 
             // Read type of value
             int32_t s_type_i_ref;
             io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
             const int32_t s_type_i = (int32_t)s_l[il]->type;
+
             if (s_type_i != s_type_i_ref) {
                 LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
                 return false;
@@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
     } else {
         // For each layer, read the values for each cell (transposed)
         for (uint32_t il = 0; il < n_layer; ++il) {
+            // skip null layers
+            if (s_l[il] == nullptr) continue;
+
             const uint32_t n_embd_s = hparams.n_embd_s();
 
             // Read type of value
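
The recurrent-memory hunks add the same guard on both sides of the state format: layers whose r_l/s_l tensors are null are skipped when the state is written, and the reader skips them too so the stream stays aligned. A standalone sketch of that required symmetry; tensor_stub, write_layers and read_layers are illustrative stand-ins, not the llama.cpp I/O interface:

    // Sketch: writer and reader must share the skip predicate, or later fields are read out of order.
    #include <cstdint>
    #include <vector>

    struct tensor_stub { int32_t type; };

    template <typename IO>
    void write_layers(IO & io, const std::vector<tensor_stub *> & layers) {
        for (const tensor_stub * t : layers) {
            if (t == nullptr) continue;           // skip null layers
            io.write(&t->type, sizeof(t->type));  // only present layers hit the stream
        }
    }

    template <typename IO>
    bool read_layers(IO & io, const std::vector<tensor_stub *> & layers) {
        for (const tensor_stub * t : layers) {
            if (t == nullptr) continue;            // must mirror the writer's skip
            int32_t type_ref;
            io.read_to(&type_ref, sizeof(type_ref));
            if (type_ref != t->type) return false; // layout mismatch
        }
        return true;
    }
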
@@ -646,6 +646,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
             ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
+            // MiniCPM uses rope by default, unlike Granite which uses it as a switch
+            hparams.rope_finetuned = true;
+
             switch (hparams.n_layer) {
                 case 52: type = LLM_TYPE_1B; break;
                 case 40: type = LLM_TYPE_2B; break;
@@ -1544,7 +1547,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
 
             switch (hparams.n_layer) {
-                case 12: type = LLM_TYPE_190M; break;
+                case 12:
+                    switch (hparams.n_embd) {
+                        case 768: type = LLM_TYPE_190M; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 case 24:
                     switch (hparams.n_embd) {
                         case 1024: type = LLM_TYPE_450M; break;
@@ -1557,7 +1564,17 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                         case 3584: type = LLM_TYPE_7B; break;
                         default: type = LLM_TYPE_UNKNOWN;
                     } break;
-                case 32: type = LLM_TYPE_2_9B; break; // RWKV-7-World
+                case 32:
+                    switch (hparams.n_embd) {
+                        case 2560: type = LLM_TYPE_2_9B; break;
+                        case 4096: type = LLM_TYPE_7B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
+                case 61:
+                    switch (hparams.n_embd) {
+                        case 4096: type = LLM_TYPE_14B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    } break;
                 default: type = LLM_TYPE_UNKNOWN;
             }
         } break;
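
In the last two hunks above, the layer count alone no longer identifies the model size, so the switch now also keys on hparams.n_embd (32 layers map to 2.9B at n_embd 2560 but to 7B at 4096, and 61 layers with n_embd 4096 map to 14B). A compact sketch of the same two-key lookup, with an illustrative enum and function name rather than the library's:

    // Sketch of disambiguating the model size by (n_layer, n_embd); names are illustrative.
    #include <cstdint>

    enum class llm_size { s190M, s2_9B, s7B, s14B, unknown };

    static llm_size size_of(uint32_t n_layer, uint32_t n_embd) {
        switch (n_layer) {
            case 12: return n_embd ==  768 ? llm_size::s190M : llm_size::unknown;
            case 32: return n_embd == 2560 ? llm_size::s2_9B
                          : n_embd == 4096 ? llm_size::s7B   : llm_size::unknown;
            case 61: return n_embd == 4096 ? llm_size::s14B  : llm_size::unknown;
            default: return llm_size::unknown;
        }
    }
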
package/src/tts_utils.cpp CHANGED
@@ -357,3 +357,15 @@ std::vector<float> embd_to_audio(const float *embd, const int n_codes,
 
     return audio;
 }
+
+const char *get_tts_grammar(const tts_type type) {
+    switch (type) {
+        case OUTETTS_V0_1:
+            return OUTETTS_V1_GRAMMAR;
+        case OUTETTS_V0_2:
+        case OUTETTS_V0_3:
+            return OUTETTS_V2_GRAMMAR;
+        default:
+            return nullptr;
+    }
+}
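
The new get_tts_grammar() helper maps the detected OuteTTS version to one of the GBNF grammar strings added in tts_utils.h below, returning nullptr when no grammar applies. A hedged usage sketch; only get_tts_grammar() and tts_type come from this package, the surrounding function is illustrative:

    #include <cstdio>
    #include "tts_utils.h"

    // Pick the grammar for a detected OuteTTS version; fall back to unconstrained output.
    void report_grammar(tts_type type) {
        const char * grammar = get_tts_grammar(type);
        if (grammar == nullptr) {
            std::printf("no grammar for this TTS model type\n");
            return;
        }
        // The returned string is GBNF (OUTETTS_V1_GRAMMAR or OUTETTS_V2_GRAMMAR) intended for
        // grammar-constrained sampling of the speech-code tokens; the sampler wiring is omitted here.
        std::printf("using grammar:\n%s\n", grammar);
    }
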
package/src/tts_utils.h CHANGED
@@ -8,7 +8,7 @@
 
 #include <nlohmann/json.hpp>
 
-enum tts_type { UNKNOWN = -1, OUTETTS_V0_2 = 1, OUTETTS_V0_3 = 2 };
+enum tts_type { UNKNOWN = -1, OUTETTS_V0_1 = 1, OUTETTS_V0_2 = 2, OUTETTS_V0_3 = 3 };
 
 static std::string anyascii_string(const std::string &input);
 
@@ -20,6 +20,8 @@ std::string process_text(const std::string &text, const tts_type tts_type);
 std::vector<float> embd_to_audio(const float *embd, const int n_codes,
                                  const int n_embd, const int n_thread);
 
+const char *get_tts_grammar(const tts_type type);
+
 // the default speaker profile is from:
 // https://github.com/edwko/OuteTTS/blob/main/outetts/version/v1/default_speakers/en_male_1.json
 static const char *DEFAULT_AUDIO_TEXT =
@@ -62,3 +64,40 @@ and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><
 it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|>
 looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|>
 lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)";
+
+static const char *OUTETTS_V1_GRAMMAR = R"(
+root ::= NL? wordAudioBlock+ audioEnd NL eos?
+wordAudioBlock ::= WORD codeBlock NL
+codeBlock ::= TIME CODE{1,144}
+eos ::= "<|im_end|>"
+codeStart ::= "<|code_start|>"
+codeEnd ::= "<|code_end|>"
+audioEnd ::= "<|audio_end|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+)";
+
+static const char *OUTETTS_V2_GRAMMAR = R"(
+root ::= NL? content+ audioEnd NL eos?
+content ::= wordAudioBlock | emotionBlock
+wordAudioBlock ::= WORD punch* codeBlock space NL
+codeBlock ::= TIME CODE{1,144}
+emotionBlock ::= emotionStart TEXT emotionEnd space NL
+TEXT ::= [A-Za-z0-9 .,?!]+
+eos ::= "<|im_end|>"
+emotionStart ::= "<|emotion_start|>"
+emotionEnd ::= "<|emotion_end|>"
+audioEnd ::= "<|audio_end|>"
+space ::= "<|space|>"
+WORD ::= [A-Za-z]+
+NL ::= "\n"
+TIME ::= "<|t_" DECIMAL "|>"
+CODE ::= "<|" DIGITS "|>"
+DIGITS ::= [0-9]+
+DECIMAL ::= [0-9]+ "." [0-9]+
+punch ::= "<|" [a-z_]+ "|>"
+)";