@fugood/llama.node 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/package.json +14 -14
  2. package/scripts/llama.cpp.patch +17 -13
  3. package/src/LlamaCompletionWorker.cpp +2 -0
  4. package/src/llama.cpp/common/arg.cpp +28 -11
  5. package/src/llama.cpp/common/chat.cpp +46 -2
  6. package/src/llama.cpp/common/chat.h +7 -2
  7. package/src/llama.cpp/common/common.h +3 -2
  8. package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
  9. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  12. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  13. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +6 -0
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  16. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  17. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  18. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  20. package/src/llama.cpp/include/llama.h +1 -0
  21. package/src/llama.cpp/src/llama-arch.cpp +65 -0
  22. package/src/llama.cpp/src/llama-arch.h +10 -0
  23. package/src/llama.cpp/src/llama-chat.cpp +13 -0
  24. package/src/llama.cpp/src/llama-chat.h +1 -0
  25. package/src/llama.cpp/src/llama-context.cpp +8 -8
  26. package/src/llama.cpp/src/llama-graph.cpp +118 -9
  27. package/src/llama.cpp/src/llama-graph.h +38 -0
  28. package/src/llama.cpp/src/llama-hparams.h +5 -3
  29. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +4 -0
  30. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  31. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  32. package/src/llama.cpp/src/llama-model.cpp +499 -4
  33. package/src/llama.cpp/src/llama-model.h +24 -4
  34. package/src/llama.cpp/src/llama-quant.cpp +37 -1
  35. package/src/llama.cpp/src/llama-vocab.cpp +42 -0
package/src/llama.cpp/src/llama-quant.cpp
@@ -211,7 +211,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             const int64_t nx = tensor->ne[0];
             const int64_t qk_k = ggml_blck_size(new_type);
 
-            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -223,6 +226,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
                 new_type = GGML_TYPE_Q6_K;
             }
         }
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_MXFP4_MOE) {
+        // MoE tensors -> MXFP4
+        // other tensors -> Q8_0
+        if (tensor->ne[2] > 1) {
+            new_type = GGML_TYPE_MXFP4;
+        } else {
+            new_type = GGML_TYPE_Q8_0;
+        }
     } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") {
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
@@ -533,6 +544,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
         case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;
 
+        case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break;
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
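
Note: the new file type is selected through the existing quantization API rather than a new entry point. A minimal sketch of driving it from user code, assuming the LLAMA_FTYPE_MOSTLY_MXFP4_MOE enum value referenced in the hunks above is exported in this package's llama.h (the input/output paths are placeholders):

// sketch, not part of the diff
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <input.gguf> <output.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_MXFP4_MOE; // MoE tensors -> MXFP4, everything else -> Q8_0
    params.nthread = 4;

    // returns 0 on success
    const uint32_t rc = llama_model_quantize(argv[1], argv[2], &params);
    return rc == 0 ? 0 : 1;
}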
@@ -984,6 +997,29 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+
+                // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+                #if 1
+                if (new_type == GGML_TYPE_MXFP4) {
+                    auto * x = f32_data_03;
+
+                    //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+                    std::vector<float> deq(nrows*n_per_row);
+                    const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+                    qtype->to_float(new_data_03, deq.data(), deq.size());
+
+                    double err = 0.0f;
+                    for (int i = 0; i < (int) deq.size(); ++i) {
+                        err += fabsf(deq[i] - x[i]);
+                        //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+                        if (deq[i] != x[i]) {
+                            LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+                        }
+                    }
+                    //LLAMA_LOG_INFO("err = %f\n", err);
+                    GGML_ASSERT(err == 0.00000);
+                }
+                #endif
             }
 
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
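
The temporary sanity check above reaches into quantizer internals. The same round-trip idea can be expressed with public ggml entry points only (ggml_quantize_chunk, ggml_get_type_traits, ggml_row_size). The helper below is an illustrative sketch, not code from the diff, and it assumes GGML_TYPE_MXFP4 is available in this ggml snapshot:

// sketch: quantize -> dequantize -> mean absolute error, for any ggml type
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static double round_trip_mae(ggml_type type, const std::vector<float> & src, int64_t nrows, int64_t n_per_row) {
    // n_per_row must be a multiple of the type's block size (32 for MXFP4)
    std::vector<uint8_t> quant(ggml_row_size(type, n_per_row) * nrows);
    ggml_quantize_chunk(type, src.data(), quant.data(), /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);

    std::vector<float> deq(src.size());
    ggml_get_type_traits(type)->to_float(quant.data(), deq.data(), (int64_t) deq.size());

    double err = 0.0;
    for (size_t i = 0; i < src.size(); ++i) {
        err += std::fabs((double) deq[i] - (double) src[i]);
    }
    return err / (double) src.size();
}

int main() {
    const int64_t nrows = 4, n_per_row = 256;
    std::vector<float> data(nrows * n_per_row);
    for (size_t i = 0; i < data.size(); ++i) {
        data[i] = 0.25f * (float)(i % 16) - 2.0f; // values representable in a few mantissa bits
    }
    printf("MXFP4 round-trip MAE: %g\n", round_trip_mae(GGML_TYPE_MXFP4, data, nrows, n_per_row));
    return 0;
}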
package/src/llama.cpp/src/llama-vocab.cpp
@@ -2191,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁begin|>" // DeepSeek
                         || t.first == "<PRE>"
                         || t.first == "▁<PRE>" // CodeLlama
+                        || t.first == "<|code_prefix|>" // GLM-4.5
                         ) {
                     special_fim_pre_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2210,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁hole|>" // DeepSeek
                         || t.first == "<SUF>"
                         || t.first == "▁<SUF>" // CodeLlama
+                        || t.first == "<|code_suffix|>" // GLM-4.5
                         ) {
                     special_fim_suf_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2229,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<|fim▁end|>" // DeepSeek
                         || t.first == "<MID>"
                         || t.first == "▁<MID>" // CodeLlama
+                        || t.first == "<|code_middle|>" // GLM-4.5
                         ) {
                     special_fim_mid_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
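
The three hunks above only teach the vocab loader to recognize the GLM-4.5 spellings of the FIM markers; consumers keep using the llama_vocab_fim_pre/suf/mid getters from llama.h. A hedged sketch of assembling a prefix-suffix-middle prompt with them (build_fim_prompt is an illustrative name, and production code should also check each id against LLAMA_TOKEN_NULL):

// sketch, not part of the diff
#include "llama.h"
#include <string>
#include <vector>

static std::vector<llama_token> build_fim_prompt(
        const llama_vocab * vocab, const std::string & prefix, const std::string & suffix) {
    auto tokenize = [&](const std::string & text) {
        std::vector<llama_token> out(text.size() + 8);
        const int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                         out.data(), (int32_t) out.size(),
                                         /*add_special=*/false, /*parse_special=*/false);
        out.resize(n > 0 ? n : 0);
        return out;
    };

    std::vector<llama_token> tokens;
    tokens.push_back(llama_vocab_fim_pre(vocab)); // e.g. <|code_prefix|> for GLM-4.5
    const auto pre = tokenize(prefix);
    tokens.insert(tokens.end(), pre.begin(), pre.end());
    tokens.push_back(llama_vocab_fim_suf(vocab)); // e.g. <|code_suffix|>
    const auto suf = tokenize(suffix);
    tokens.insert(tokens.end(), suf.begin(), suf.end());
    tokens.push_back(llama_vocab_fim_mid(vocab)); // e.g. <|code_middle|>
    return tokens;
}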
@@ -2311,6 +2314,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<|eot_id|>"
                     || t.first == "<|im_end|>"
                     || t.first == "<|end|>"
+                    || t.first == "<|return|>" // o200k_harmony
+                    || t.first == "<|call|>"   // o200k_harmony
                     || t.first == "<end_of_turn>"
                     || t.first == "<|endoftext|>"
                     || t.first == "<|eom_id|>"
@@ -2334,6 +2339,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
+        // @ngxson : quick hack for gpt-oss, always render these tokens
+        for (const auto & t : token_to_id) {
+            if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+                id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
+            }
+        }
+
         // sanity checks
         if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
             special_eog_ids.insert(special_eos_id);
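
The attribute override above matters because the detokenizer renders LLAMA_TOKEN_ATTR_USER_DEFINED tokens even when special-token rendering is disabled, which is what the gpt-oss channel markers need in order to survive into the chat parser. An illustrative sketch (the helper name piece is not from the diff):

// sketch, not part of the diff
#include "llama.h"
#include <string>

static std::string piece(const llama_vocab * vocab, llama_token tok, bool render_special) {
    char buf[256];
    const int32_t n = llama_token_to_piece(vocab, tok, buf, (int32_t) sizeof(buf), /*lstrip=*/0, render_special);
    return n > 0 ? std::string(buf, n) : std::string();
}

// With this patch, piece(vocab, channel_tok, /*render_special=*/false) yields "<|channel|>"
// for a gpt-oss vocab, where a plain control token would otherwise be suppressed.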
@@ -2349,6 +2361,36 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_eog_ids.insert(special_eom_id);
             LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
         }
+
+        // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
+        // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
+        // we remove the "<|end|>" token from the EOG list
+        {
+            bool has_return = false;
+            bool has_call   = false;
+            bool has_end    = false;
+
+            llama_token end_id = LLAMA_TOKEN_NULL;
+
+            LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
+            for (auto tid : special_eog_ids) {
+                LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
+
+                if (id_to_token[tid].text == "<|return|>") {
+                    has_return = true;
+                } else if (id_to_token[tid].text == "<|call|>") {
+                    has_call = true;
+                } else if (id_to_token[tid].text == "<|end|>") {
+                    has_end = true;
+                    end_id = tid;
+                }
+            }
+
+            if (has_return && has_call && has_end) {
+                special_eog_ids.erase(end_id);
+                LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
+            }
+        }
     }
 
     // build special tokens cache
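
Downstream, this EOG bookkeeping is consumed through llama_vocab_is_eog(). With the workaround above, a standard sampling loop over a gpt-oss (o200k_harmony) vocabulary keeps generating through "<|end|>" and only stops at "<|return|>" or "<|call|>". A minimal illustrative loop (names are not from the diff):

// sketch, not part of the diff
#include "llama.h"

static void generate(llama_context * ctx, const llama_vocab * vocab, llama_sampler * smpl, int n_max) {
    for (int i = 0; i < n_max; ++i) {
        llama_token tok = llama_sampler_sample(smpl, ctx, -1);
        if (llama_vocab_is_eog(vocab, tok)) {
            break; // stops on <|return|> / <|call|>, but no longer on <|end|>, for o200k_harmony vocabs
        }
        // ... stream the rendered piece, then feed the token back
        llama_batch batch = llama_batch_get_one(&tok, 1);
        llama_decode(ctx, batch);
    }
}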