@fugood/llama.node 1.1.1 → 1.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1369,7 +1369,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     // that have no expert_gating_func model parameter set
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                 }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);

                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
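
The only change in this hunk is the added third argument: false marks LLM_KV_ROPE_SCALING_YARN_LOG_MUL as optional, so GGUFs that omit the key keep the default value instead of failing to load. Below is a minimal sketch of that required-vs-optional lookup pattern; the helper and the key string are illustrative stand-ins, not the real llama_model_loader API.

// Sketch of an optional metadata lookup: with required == false, a missing key
// leaves the caller's default untouched and returns false instead of throwing.
// Stand-in code for illustration only.
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

static std::map<std::string, float> kv;  // pretend metadata that lacks the key

static bool get_key(const std::string & key, float & out, bool required = true) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        if (required) {
            throw std::runtime_error("key not found: " + key);
        }
        return false;  // optional: caller's default survives
    }
    out = it->second;
    return true;
}

int main() {
    float rope_yarn_log_mul = 0.0f;
    const bool found = get_key("rope.scaling.yarn_log_multiplier", rope_yarn_log_mul, /*required =*/ false);
    std::printf("found=%d value=%g\n", found ? 1 : 0, rope_yarn_log_mul);  // found=0 value=0
    return 0;
}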
@@ -1768,6 +1768,29 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SMALLTHINKER:
+            {
+                const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+
+                if (found_swa && hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    hparams.n_swa = 4096;
+                    hparams.set_swa_pattern(4, true);
+                } else {
+                    hparams.swa_type = LLAMA_SWA_TYPE_NONE;
+                    hparams.n_no_rope_layer_step = hparams.n_layer;
+                }
+
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_4B; break;
+                    case 52: type = LLM_TYPE_20B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         default: throw std::runtime_error("unsupported model architecture");
     }

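
For LLM_ARCH_SMALLTHINKER the hunk above takes one of two paths: if the GGUF declares a sliding window, n_swa is forced to 4096 and set_swa_pattern(4, true) marks which layers use it; otherwise SWA is disabled and n_no_rope_layer_step is set to n_layer, which (per the graph builder later in this diff) means RoPE is applied on every layer. The toy program below illustrates one plausible reading of the 4-layer pattern, with every 4th layer dense and the rest windowed; the exact assignment made by the real llama_hparams helper is an assumption here, not something this diff shows.

// Toy illustration, not llama.cpp code: one plausible layer assignment for
// hparams.set_swa_pattern(4, true) on the 32-layer (LLM_TYPE_4B) configuration.
#include <cstdio>
#include <vector>

int main() {
    const int n_layer   = 32;    // matches "case 32: type = LLM_TYPE_4B" above
    const int n_pattern = 4;
    const int n_swa     = 4096;

    std::vector<bool> is_swa(n_layer);
    for (int il = 0; il < n_layer; ++il) {
        is_swa[il] = (il % n_pattern) != 0;  // assumption: layers 0, 4, 8, ... stay dense
    }

    std::printf("window size: %d tokens\n", n_swa);
    for (int il = 0; il < 8; ++il) {
        std::printf("layer %2d: %s\n", il, is_swa[il] ? "sliding window" : "full attention");
    }
    return 0;
}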
@@ -5165,6 +5188,42 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     }
                 }
             } break;
+        case LLM_ARCH_SMALLTHINKER:
+            {
+                tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
+
+                // output
+                output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
+                output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                // if output is NULL, init from the input tok embed
+                if (output == NULL) {
+                    output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    auto & layer = layers[i];
+
+                    layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
+
+                    layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
+                    layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
+                    layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
+                    layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
+
+                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);
+
+                    GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for SMALLTHINKER");
+                    GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for SMALLTHINKER");
+
+                    // MoE branch
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0);
+                    layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                    layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0);
+                    layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0);
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
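
The MoE branch above allocates a small router (ffn_gate_inp) plus three expert tensors per layer, each with n_expert as the third dimension. The sketch below just multiplies those shapes out to show where the parameters go; the dimensions are placeholders for illustration, not SmallThinker's actual configuration.

// Back-of-the-envelope weight counts for the per-layer MoE tensors created above.
// All dimensions below are placeholder values, not the model's real hparams.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd   = 2048;  // placeholder
    const int64_t n_ff_exp = 1024;  // placeholder (LLM_KV_EXPERT_FEED_FORWARD_LENGTH)
    const int64_t n_expert = 32;    // placeholder

    const int64_t gate_inp  = n_embd * n_expert;             // ffn_gate_inp:  { n_embd, n_expert }
    const int64_t gate_exps = n_embd * n_ff_exp * n_expert;  // ffn_gate_exps: { n_embd, n_ff_exp, n_expert }
    const int64_t down_exps = n_ff_exp * n_embd * n_expert;  // ffn_down_exps: { n_ff_exp, n_embd, n_expert }
    const int64_t up_exps   = n_embd * n_ff_exp * n_expert;  // ffn_up_exps:   { n_embd, n_ff_exp, n_expert }

    std::printf("router weights per layer : %lld\n", (long long) gate_inp);
    std::printf("expert weights per layer : %lld\n", (long long) (gate_exps + down_exps + up_exps));
    return 0;
}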
@@ -5490,6 +5549,11 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
     }

+    if (arch == LLM_ARCH_SMALLTHINKER) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+    }
+
     vocab.print_info();
 }

@@ -16191,7 +16255,7 @@ private:
         {
             // PLaMo-2 uses combined QKV tensor
             ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, cur);
-            cb(qkv, "qkv", il);
+            cb(qkv, "wqkv", il);

             // split QKV tensor into Q, K, V
             const int64_t n_embd_head_q = hparams.n_embd_head_k;
@@ -16231,7 +16295,7 @@ private:
                 ext_factor, attn_factor, beta_fast, beta_slow
             );

-            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f, il);
+            cur = build_attn(inp, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, NULL, NULL, 1.0f/sqrtf(float(n_embd_head_v)), il);
         }

         cb(cur, "attn_out", il);
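
The second change in this block replaces the hard-coded 1.0f attention scale with the conventional 1/sqrt(head size). The standalone toy below shows why that factor matters: dot products of unit-variance query/key vectors grow with the head dimension, and dividing by sqrt(d) keeps the pre-softmax logits roughly O(1). It illustrates the arithmetic only and is not llama.cpp code.

// Toy: compare a raw q.k dot product against the 1/sqrt(d_head) scaled version.
#include <cmath>
#include <cstdio>
#include <random>

int main() {
    const int d_head = 128;
    std::mt19937 rng(42);
    std::normal_distribution<float> n01(0.0f, 1.0f);

    double raw = 0.0;
    for (int i = 0; i < d_head; ++i) {
        raw += n01(rng) * n01(rng);  // q[i] * k[i]
    }
    const double scaled = raw / std::sqrt((double) d_head);
    std::printf("raw logit ~ %.2f, scaled logit ~ %.2f\n", raw, scaled);
    return 0;
}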
@@ -16306,8 +16370,9 @@ private:
        ggml_build_forward_expand(gf,
            ggml_cpy(ctx0, last_conv,
                ggml_view_1d(ctx0, conv_states_all,
-                   (d_conv - 1)*(d_inner)*(n_seqs),
-                   kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all))));
+                   (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs),
+                   kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all))));
+       cb(conv_states_all, "mamba_conv1d_state", il);

        // 1D convolution
        x = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d);
@@ -16370,9 +16435,9 @@ private:
        // store last states
        ggml_build_forward_expand(gf,
            ggml_cpy(ctx0,
-               ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]*x->ne[3]),
-               ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs,
-                   kv_head*d_state*d_inner*ggml_element_size(ssm_states_all))));
+               ggml_view_1d(ctx0, y_ssm, n_heads*head_dim*d_state*n_seqs, n_heads*head_dim*n_seq_tokens*n_seqs*ggml_element_size(y_ssm)),
+               ggml_view_1d(ctx0, ssm_states_all, n_heads*head_dim*d_state*n_seqs, kv_head*n_seqs*n_heads*head_dim*d_state*ggml_element_size(ssm_states_all))));
+       cb(ssm_states_all, "mamba_ssm_states", il);

        ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_heads, n_seq_tokens, n_seqs, head_dim * ggml_element_size(x), head_dim * n_heads * ggml_element_size(x), head_dim * n_heads * n_seq_tokens * ggml_element_size(x), 0);
        cb(y, "mamba_y_view", il);
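
Both fixes above change how much recurrent state gets written back per sequence: the conv state now spans d_inner + 2*n_group*d_state channels (presumably because the grouped B/C projections share the 1D convolution), and the SSM state is addressed per head as n_heads*head_dim*d_state instead of a flat d_state*d_inner block. The sketch below just evaluates those element counts; all dimensions are placeholders, not PLaMo-2's real configuration.

// Element counts for the two recurrent-state copies above, with placeholder sizes.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t d_conv   = 4;     // placeholder
    const int64_t d_inner  = 4096;  // placeholder
    const int64_t n_group  = 1;     // placeholder
    const int64_t d_state  = 64;    // placeholder
    const int64_t n_heads  = 32;    // placeholder
    const int64_t head_dim = d_inner / n_heads;
    const int64_t n_seqs   = 2;     // placeholder

    const int64_t conv_elems = (d_conv - 1) * (d_inner + 2*n_group*d_state) * n_seqs;
    const int64_t ssm_elems  = n_heads * head_dim * d_state * n_seqs;

    std::printf("conv state elements: %lld\n", (long long) conv_elems);
    std::printf("ssm  state elements: %lld\n", (long long) ssm_elems);
    return 0;
}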
@@ -17010,6 +17075,119 @@ struct llm_build_lfm2 : public llm_graph_context {
     }
 };

+template <bool iswa>
+struct llm_build_smallthinker : public llm_graph_context{
+    llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        ggml_tensor * cur;
+        ggml_tensor * inpL;
+
+        inpL = build_inp_embd(model.tok_embd);
+
+        // inp_pos - contains the positions
+        ggml_tensor * inp_pos = build_inp_pos();
+
+        using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+        inp_attn_type * inp_attn = nullptr;
+
+        if constexpr (iswa) {
+            inp_attn = build_attn_inp_kv_unified_iswa();
+        } else {
+            inp_attn = build_attn_inp_kv_unified();
+        }
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+            ggml_tensor * probs = nullptr;
+
+            probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
+            cb(probs, "ffn_moe_logits", il);
+
+            // norm
+            cur = build_norm(inpL,model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
+                    Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                         ext_factor, attn_factor, beta_fast, beta_slow);
+
+                    Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                                         ext_factor, attn_factor, beta_fast, beta_slow);
+                }
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+                probs = ggml_get_rows(ctx0, probs, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);

+            // MoE branch
+            cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            ggml_tensor * ffn_out = build_moe_ffn_from_probs(cur, probs, model.layers[il].ffn_up_exps,
+                                        model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps,
+                                        nullptr, n_expert, n_expert_used,
+                                        static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func), il);
+
+            cb(ffn_out, "ffn_out", il);
+            cur = ffn_out;
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;

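
llm_build_smallthinker is parameterized on a single bool so that the sliding-window and full-attention variants become separate template instantiations: std::conditional_t selects the attention-input type and if constexpr selects the matching build call, leaving no runtime branch inside the instantiated constructor. The stripped-down sketch below shows the same pattern with stand-in types; llm_graph_input_attn_kv_unified(_iswa) and the build_attn_inp_* helpers are only mimicked, not reproduced.

// Compile-time selection sketch mirroring llm_build_smallthinker<iswa>.
// The types and builder calls are stand-ins, not llama.cpp's.
#include <cstdio>
#include <memory>
#include <type_traits>

struct attn_inp_full { const char * name = "unified kv"; };
struct attn_inp_swa  { const char * name = "unified kv + sliding window"; };

template <bool iswa>
struct graph_builder {
    using inp_attn_type = std::conditional_t<iswa, attn_inp_swa, attn_inp_full>;

    std::unique_ptr<inp_attn_type> inp_attn;

    graph_builder() {
        if constexpr (iswa) {
            inp_attn = std::make_unique<attn_inp_swa>();   // only instantiated for <true>
        } else {
            inp_attn = std::make_unique<attn_inp_full>();  // only instantiated for <false>
        }
        std::printf("attention input: %s\n", inp_attn->name);
    }
};

int main() {
    const bool use_swa = true;  // mirrors the swa_type check in build_graph below
    if (use_swa) {
        graph_builder<true> b;   // like llm_build_smallthinker<true>
        (void) b;
    } else {
        graph_builder<false> b;  // like llm_build_smallthinker<false>
        (void) b;
    }
    return 0;
}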
@@ -17448,6 +17626,14 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            {
                llm = std::make_unique<llm_build_lfm2>(*this, params);
            } break;
+        case LLM_ARCH_SMALLTHINKER:
+            {
+                if (hparams.swa_type == LLAMA_SWA_TYPE_STANDARD) {
+                    llm = std::make_unique<llm_build_smallthinker<true>> (*this, params);
+                } else {
+                    llm = std::make_unique<llm_build_smallthinker<false>>(*this, params);
+                }
+            } break;
        default:
            GGML_ABORT("fatal error");
    }
@@ -17646,6 +17832,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
        case LLM_ARCH_DOTS1:
        case LLM_ARCH_HUNYUAN_MOE:
        case LLM_ARCH_LFM2:
+       case LLM_ARCH_SMALLTHINKER:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
package/src/tts_utils.h CHANGED
@@ -68,7 +68,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
 static const char *OUTETTS_V1_GRAMMAR = R"(
 root ::= NL? wordAudioBlock+ audioEnd NL eos?
 wordAudioBlock ::= WORD codeBlock NL
-codeBlock ::= TIME CODE{1,144}
+codeBlock ::= TIME CODE*
 eos ::= "<|im_end|>"
 codeStart ::= "<|code_start|>"
 codeEnd ::= "<|code_end|>"
@@ -85,7 +85,7 @@ static const char *OUTETTS_V2_GRAMMAR = R"(
 root ::= NL? content+ audioEnd NL eos?
 content ::= wordAudioBlock | emotionBlock
 wordAudioBlock ::= WORD punch* codeBlock space NL
-codeBlock ::= TIME CODE{1,144}
+codeBlock ::= TIME CODE*
 emotionBlock ::= emotionStart TEXT emotionEnd space NL
 TEXT ::= [A-Za-z0-9 .,?!]+
 eos ::= "<|im_end|>"
@@ -94,7 +94,7 @@ emotionEnd ::= "<|emotion_end|>"
 audioEnd ::= "<|audio_end|>"
 space ::= "<|space|>"
 WORD ::= [A-Za-z]+
-NL ::= "\n"
+NL ::= [\n]
 TIME ::= "<|t_" DECIMAL "|>"
 CODE ::= "<|" DIGITS "|>"
 DIGITS ::= [0-9]+
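
Both OuteTTS grammars drop the bounded repetition: CODE{1,144} required between 1 and 144 audio-code tokens per word, while CODE* accepts any count (including zero), so the grammar itself no longer caps a word's code run; NL also becomes a character class instead of a string literal. The fragment below merely restates the before/after rule as C++ raw strings, the same way tts_utils.h embeds its GBNF; it is illustrative and not part of the package.

// Before/after of the relaxed codeBlock rule, embedded like the grammars above.
#include <cstdio>

static const char * CODE_BLOCK_OLD = R"(codeBlock ::= TIME CODE{1,144})";
static const char * CODE_BLOCK_NEW = R"(codeBlock ::= TIME CODE*)";

int main() {
    std::printf("old: %s\nnew: %s\n", CODE_BLOCK_OLD, CODE_BLOCK_NEW);
    return 0;
}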