@fugood/llama.node 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -96,6 +96,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DREAM, "dream" },
     { LLM_ARCH_SMALLTHINKER, "smallthinker" },
     { LLM_ARCH_LLADA, "llada" },
+    { LLM_ARCH_LLADA_MOE, "llada-moe" },
     { LLM_ARCH_SEED_OSS, "seed_oss" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -139,6 +140,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
     { LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
+    { LLM_KV_ROUTER_LOGIT_SOFTCAPPING, "%s.router_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_SWIN_NORM, "%s.swin_norm" },
     { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" },
@@ -169,19 +171,25 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
+    { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
+    { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
 
-    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
-    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
-    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
-    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
-    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
-    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
-    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
+    { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
+    { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
+    { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
+    { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
+    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
+    { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
+    { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" },
+    { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" },
 
     { LLM_KV_SPLIT_NO, "split.no" },
     { LLM_KV_SPLIT_COUNT, "split.count" },
@@ -398,12 +406,16 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
             { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
             { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
             { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
             { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
             { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
             { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
             { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
         },
@@ -2136,6 +2148,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_LLADA_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_SEED_OSS,
         {
@@ -2416,6 +2448,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
     switch (arch) {
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
+        case LLM_ARCH_LLADA_MOE:
            return true;
        default:
            return false;
@@ -100,6 +100,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_LLADA_MOE,
     LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
@@ -143,6 +144,7 @@ enum llm_kv {
     LLM_KV_DECODER_START_TOKEN_ID,
     LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
+    LLM_KV_ROUTER_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
     LLM_KV_RESCALE_EVERY_N_LAYERS,
@@ -173,6 +175,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
     LLM_KV_ATTENTION_SCALE,
+    LLM_KV_ATTENTION_OUTPUT_SCALE,
+    LLM_KV_ATTENTION_TEMPERATURE_LENGTH,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
 
@@ -186,6 +190,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
     LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
+    LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR,
+    LLM_KV_ROPE_SCALING_YARN_BETA_FAST,
+    LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,
 
     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
@@ -70,6 +70,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
+    { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -204,6 +205,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
     } else if (tmpl_contains("<seed:bos>")) {
         return LLM_CHAT_TEMPLATE_SEED_OSS;
+    } else if (tmpl_contains("'Assistant: ' + message['content'] + '<|separator|>")) {
+        return LLM_CHAT_TEMPLATE_GROK_2;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -763,6 +766,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<seed:bos>assistant\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GROK_2) {
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "System: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "user") {
+                ss << "Human: " << trim(message->content) << "<|separator|>\n\n";
+            } else if (role == "assistant") {
+                ss << "Assistant: " << message->content << "<|separator|>\n\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
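
For reference, the new Grok-2 branch above renders a conversation of one system, one user, and one assistant message (hypothetical contents) like this, with add_ass leaving the prompt open on an assistant turn:

    System: You are a helpful assistant.<|separator|>

    Human: Hello<|separator|>

    Assistant: Hi!<|separator|>

    Assistant: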
@@ -50,6 +50,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
+    LLM_CHAT_TEMPLATE_GROK_2,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
@@ -35,10 +35,10 @@ llama_context::llama_context(
 
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch;
-    cparams.yarn_ext_factor = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast = params.yarn_beta_fast;
-    cparams.yarn_beta_slow = params.yarn_beta_slow;
+    cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor;
+    cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor;
+    cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast;
+    cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow;
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.no_perf = params.no_perf;
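
Together with the new -1.0f defaults in llama_context_default_params and the YaRN fields added to llama_hparams (both appear further down in this diff), the ternaries above give each YaRN parameter a fallback: a negative value in the context params means "unset", so the per-model value loaded from the GGUF metadata is used instead. A minimal sketch of that selection rule, using a hypothetical helper name:

    // Sketch only: a negative context-param value falls back to the model's
    // own hparams default, mirroring the four assignments in the hunk above.
    static float resolve_yarn_param(float from_cparams, float from_hparams) {
        return from_cparams >= 0.0f ? from_cparams : from_hparams;
    }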
@@ -181,7 +181,7 @@ llama_context::llama_context(
     // graph outputs buffer
     {
         // resized during inference when a batch uses more outputs
-        if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
+        if (output_reserve(params.n_seq_max) < params.n_seq_max) {
             throw std::runtime_error("failed to reserve initial output buffer");
         }
 
@@ -2263,9 +2263,9 @@ llama_context_params llama_context_default_params() {
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
-        /*.yarn_attn_factor =*/ 1.0f,
-        /*.yarn_beta_fast =*/ 32.0f,
-        /*.yarn_beta_slow =*/ 1.0f,
+        /*.yarn_attn_factor =*/ -1.0f,
+        /*.yarn_beta_fast =*/ -1.0f,
+        /*.yarn_beta_slow =*/ -1.0f,
         /*.yarn_orig_ctx =*/ 0,
         /*.defrag_thold =*/ -1.0f,
         /*.cb_eval =*/ nullptr,
@@ -4,7 +4,7 @@
 
 #include <cstdint>
 
-#define LLAMA_MAX_SEQ 64
+#define LLAMA_MAX_SEQ 256
 
 struct llama_cparams {
     uint32_t n_ctx; // context size used during inference
@@ -1335,14 +1335,14 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 
         if (arch == LLM_ARCH_GROK) {
             // need to do the following:
-            // multiply by attn_output_multiplyer of 0.08838834764831845
+            // multiply by attn_output_multiplier
            // and then :
            // kq = 30 * tanh(kq / 30)
            // before the softmax below
 
-            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, 0.08838834764831845f/30.0f));
+            kq = ggml_tanh(ctx0, ggml_scale(ctx0, kq, hparams.f_attn_out_scale / hparams.f_attn_logit_softcapping));
             cb(kq, "kq_tanh", il);
-            kq = ggml_scale(ctx0, kq, 30);
+            kq = ggml_scale(ctx0, kq, hparams.f_attn_logit_softcapping);
             cb(kq, "kq_scaled", il);
         }
 
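The two ggml ops above implement a tanh soft cap on the attention logits: scale by f_attn_out_scale over the cap, apply tanh, then scale back up by the cap. A minimal scalar sketch of the same math (not the graph code); with the previously hard-coded Grok-1 constants (multiplier 0.08838834764831845, cap 30) it reproduces the old behavior, while Grok-2 models can now supply both values through hparams:

    #include <cmath>

    // Per-logit soft cap: the result is bounded to (-softcap, softcap).
    static float grok_softcap(float kq, float attn_out_scale, float softcap) {
        return softcap * std::tanh(kq * attn_out_scale / softcap);
    }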
@@ -82,8 +82,9 @@ struct llama_hparams {
     float f_norm_rms_eps;
     float f_norm_group_eps;
 
-    float f_attn_logit_softcapping = 50.0f;
-    float f_final_logit_softcapping = 30.0f;
+    float f_attn_logit_softcapping = 50.0f;
+    float f_router_logit_softcapping = 30.0f;
+    float f_final_logit_softcapping = 30.0f;
 
     // for RWKV
     uint32_t rescale_every_n_layers = 0;
@@ -104,6 +105,11 @@ struct llama_hparams {
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;
 
+    float yarn_ext_factor = -1.0f;
+    float yarn_attn_factor = 1.0f;
+    float yarn_beta_fast = 32.0f;
+    float yarn_beta_slow = 1.0f;
+
     std::array<int, 4> rope_sections;
 
     // Sliding Window Attention (SWA)
@@ -136,10 +142,14 @@ struct llama_hparams {
     float f_embedding_scale = 0.0f;
     float f_attention_scale = 0.0f;
 
+    // grok-2
+    float f_attn_out_scale = 0.0f;
+    uint32_t attn_temp_length = 0;
+
     bool causal_attn = true;
     bool use_alibi = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm = true;
+    bool use_kq_norm = false;
 
     // for Classifiers
     uint32_t n_cls_out = 1;