@fugood/llama.node 1.3.7 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/lib/binding.js +18 -1
  2. package/lib/binding.ts +19 -1
  3. package/lib/index.js +3 -3
  4. package/lib/index.ts +1 -1
  5. package/package.json +15 -15
  6. package/scripts/llama.cpp.patch +7 -7
  7. package/src/LlamaCompletionWorker.cpp +2 -2
  8. package/src/llama.cpp/common/arg.cpp +27 -2
  9. package/src/llama.cpp/common/chat-parser.cpp +968 -0
  10. package/src/llama.cpp/common/chat.cpp +0 -952
  11. package/src/llama.cpp/common/common.cpp +55 -0
  12. package/src/llama.cpp/common/common.h +18 -0
  13. package/src/llama.cpp/common/json-schema-to-grammar.cpp +2 -2
  14. package/src/llama.cpp/ggml/CMakeLists.txt +6 -4
  15. package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -1
  16. package/src/llama.cpp/ggml/include/ggml.h +12 -4
  17. package/src/llama.cpp/ggml/src/CMakeLists.txt +26 -4
  18. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +29 -15
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +721 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/cpu-feats.cpp +38 -0
  21. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +22 -2
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +9 -0
  23. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +71 -4
  24. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +243 -4
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +6 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +84 -85
  28. package/src/llama.cpp/include/llama.h +18 -0
  29. package/src/llama.cpp/src/CMakeLists.txt +2 -0
  30. package/src/llama.cpp/src/llama-arch.cpp +95 -16
  31. package/src/llama.cpp/src/llama-arch.h +15 -0
  32. package/src/llama.cpp/src/llama-context.cpp +7 -3
  33. package/src/llama.cpp/src/llama-graph.cpp +3 -3
  34. package/src/llama.cpp/src/llama-hparams.h +1 -1
  35. package/src/llama.cpp/src/llama-model.cpp +141 -6
  36. package/src/llama.cpp/src/llama-model.h +4 -0
  37. package/src/llama.cpp/src/llama-quant.cpp +13 -5
  38. package/src/llama.cpp/src/models/lfm2.cpp +5 -3
  39. package/src/llama.cpp/src/models/models.h +55 -1
  40. package/src/llama.cpp/src/models/qwen3next.cpp +1042 -0
  41. package/src/llama.cpp/src/models/rnd1.cpp +126 -0
@@ -0,0 +1,126 @@
+ #include "models.h"
+
+ // RND1 is a Qwen3Moe AR model converted to diffusion model.
+ llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+     const int64_t n_embd_head = hparams.n_embd_head_v;
+
+     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+     GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+     ggml_tensor * cur;
+     ggml_tensor * inpL;
+
+     inpL = build_inp_embd(model.tok_embd);
+
+     // inp_pos - contains the positions
+     ggml_tensor * inp_pos = build_inp_pos();
+
+     // Non-causal attention for diffusion
+     auto * inp_attn = build_attn_inp_no_cache();
+
+     ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+     for (int il = 0; il < n_layer; ++il) {
+         ggml_tensor * inpSA = inpL;
+
+         // norm
+         cur = build_norm(inpL,
+                 model.layers[il].attn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "attn_norm", il);
+
+         // self_attention
+         {
+             // compute Q and K and RoPE them
+             ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+             cb(Qcur, "Qcur", il);
+
+             ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+             cb(Kcur, "Kcur", il);
+
+             ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+             cb(Vcur, "Vcur", il);
+
+             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+             Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+             cb(Qcur, "Qcur_normed", il);
+
+             Qcur = ggml_rope_ext(
+                     ctx0, Qcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+             cb(Kcur, "Kcur_normed", il);
+
+             Kcur = ggml_rope_ext(
+                     ctx0, Kcur, inp_pos, nullptr,
+                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                     ext_factor, attn_factor, beta_fast, beta_slow
+                     );
+
+             cb(Qcur, "Qcur", il);
+             cb(Kcur, "Kcur", il);
+             cb(Vcur, "Vcur", il);
+
+             cur = build_attn(inp_attn,
+                     model.layers[il].wo, model.layers[il].bo,
+                     Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+         }
+         if (il == n_layer - 1 && inp_out_ids) {
+             cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+         }
+         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+         cb(ffn_inp, "ffn_inp", il);
+
+         // MoE branch
+         cur = build_norm(ffn_inp,
+                 model.layers[il].ffn_norm, NULL,
+                 LLM_NORM_RMS, il);
+         cb(cur, "ffn_norm", il);
+
+         ggml_tensor * moe_out =
+             build_moe_ffn(cur,
+                     model.layers[il].ffn_gate_inp,
+                     model.layers[il].ffn_up_exps,
+                     model.layers[il].ffn_gate_exps,
+                     model.layers[il].ffn_down_exps,
+                     nullptr,
+                     n_expert, n_expert_used,
+                     LLM_FFN_SILU, true,
+                     false, 0.0,
+                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                     il);
+         cb(moe_out, "ffn_moe_out", il);
+         cur = moe_out;
+
+         cur = ggml_add(ctx0, cur, ffn_inp);
+
+         cur = build_cvec(cur, il);
+         cb(cur, "l_out", il);
+
+         // input for next layer
+         inpL = cur;
+     }
+     cur = inpL;
+
+     cur = build_norm(cur,
+             model.output_norm, NULL,
+             LLM_NORM_RMS, -1);
+
+     cb(cur, "result_norm", -1);
+     res->t_embd = cur;
+
+     // lm_head
+     cur = build_lora_mm(model.output, cur);
+
+     cb(cur, "result_output", -1);
+     res->t_logits = cur;
+
+     ggml_build_forward_expand(gf, cur);
+ }
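
Note on the new rnd1.cpp graph above: unlike the autoregressive Qwen3-MoE graph it is derived from, RND1 requests a non-causal attention input (build_attn_inp_no_cache), because diffusion-style decoding attends across all positions in both directions rather than left-to-right. The standalone sketch below is illustrative only and not part of the package or of llama.cpp; it simply contrasts a causal mask with the full bidirectional mask such a model implies, using a hypothetical boolean-matrix representation and a made-up sequence length.

// Illustrative sketch (not llama.cpp code): causal vs. non-causal attention masks.
// A causal mask lets token i attend only to positions j <= i; a non-causal mask,
// as used for diffusion-style models, lets every token attend to every position.
#include <cstdio>
#include <vector>

int main() {
    const int n_tokens = 4; // hypothetical sequence length for the demo

    std::vector<std::vector<bool>> causal(n_tokens, std::vector<bool>(n_tokens));
    std::vector<std::vector<bool>> non_causal(n_tokens, std::vector<bool>(n_tokens));

    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_tokens; ++j) {
            causal[i][j]     = (j <= i); // autoregressive: no attending to future tokens
            non_causal[i][j] = true;     // diffusion: full bidirectional attention
        }
    }

    printf("causal | non-causal (1 = may attend):\n");
    for (int i = 0; i < n_tokens; ++i) {
        for (int j = 0; j < n_tokens; ++j) printf("%d ", (int) causal[i][j]);
        printf("| ");
        for (int j = 0; j < n_tokens; ++j) printf("%d ", (int) non_causal[i][j]);
        printf("\n");
    }
    return 0;
}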