@fugood/llama.node 1.4.6 → 1.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
 #include "models.h"
 
-llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+template <bool iswa>
+llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
     const int64_t n_embd_head = hparams.n_embd_head_k;
 
     ggml_tensor * cur;
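This release generalizes the Gemma 3 graph builder: the iSWA-only class llm_build_gemma3_iswa becomes the template llm_build_gemma3<bool iswa>, covering both the interleaved sliding-window-attention variant (iswa == true) and a plain KV-cache variant (iswa == false). The next hunk selects the attention input with std::conditional_t and if constexpr; here is a minimal self-contained sketch of that idiom, with stand-in types rather than the library's own:

#include <cstdio>
#include <type_traits>

// Stand-ins for the real llm_graph_input_attn_kv / llm_graph_input_attn_kv_iswa types.
struct kv_input      { const char * name = "kv";      };
struct kv_iswa_input { const char * name = "kv_iswa"; };

template <bool iswa>
void build_demo() {
    // std::conditional_t picks the input type at compile time; `if constexpr`
    // likewise drops the untaken branch, so each instantiation only references
    // the builder it actually calls.
    using input_t = std::conditional_t<iswa, kv_iswa_input, kv_input>;
    input_t inp;
    std::printf("attn input: %s\n", inp.name);
}

int main() {
    build_demo<true>();   // attn input: kv_iswa
    build_demo<false>();  // attn input: kv
}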
@@ -17,13 +18,28 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     ggml_tensor * inp_pos = build_inp_pos();
 
     // TODO: is causal == true correct? might need some changes
-    auto * inp_attn = build_attn_inp_kv_iswa();
+    using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
+    inp_attn_type * inp_attn = nullptr;
+
+    if constexpr (iswa) {
+        inp_attn = build_attn_inp_kv_iswa();
+    } else {
+        inp_attn = build_attn_inp_kv();
+    }
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
-        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
-        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        float freq_base_l  = 0.0f;
+        float freq_scale_l = 0.0f;
+
+        if constexpr (iswa) {
+            freq_base_l  = model.get_rope_freq_base (cparams, il);
+            freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        } else {
+            freq_base_l  = freq_base;
+            freq_scale_l = freq_scale;
+        }
 
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
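This hunk makes the RoPE parameters conditional: with iswa the model supplies per-layer values (Gemma 3 interleaves sliding-window layers, which use a local RoPE base, with full-attention layers, which use a long-context base), while the non-iswa variant falls back to the single global freq_base/freq_scale. A hedged sketch of per-layer selection; the helper name, layer pattern, and base values are illustrative, not read from this diff:

#include <cstdio>

// Hypothetical helper, not the library API: choose a per-layer RoPE base.
static float rope_freq_base_for_layer(bool is_swa_layer, float base_local, float base_full) {
    return is_swa_layer ? base_local : base_full;
}

int main() {
    // Illustrative Gemma 3-style pattern: every 6th layer is full attention;
    // local layers use a 10k base, full-attention layers a 1M base.
    for (int il = 0; il < 12; ++il) {
        const bool swa = (il % 6) != 5;
        std::printf("layer %2d: %s freq_base = %.0f\n",
                    il, swa ? "swa, " : "full,",
                    rope_freq_base_for_layer(swa, 10000.0f, 1000000.0f));
    }
}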
@@ -102,7 +118,7 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
         cur = build_norm(cur,
                 model.layers[il].ffn_post_norm, NULL,
                 LLM_NORM_RMS, -1);
-        cb(cur, "ffn_post_norm", -1);
+        cb(cur, "ffn_post_norm", il);
 
         cur = ggml_add(ctx0, cur, sa_out);
 
@@ -124,8 +140,17 @@ llm_build_gemma3_iswa::llm_build_gemma3_iswa(const llama_model & model, const ll
     // lm_head
     cur = build_lora_mm(model.output, cur);
 
+    if (hparams.f_final_logit_softcapping) {
+        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
+        cur = ggml_tanh(ctx0, cur);
+        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
+    }
+
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 
     ggml_build_forward_expand(gf, cur);
 }
+
+template struct llm_build_gemma3<false>;
+template struct llm_build_gemma3<true>;
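The added block implements final-logit softcapping, composed from the three graph ops above: logits become f_final_logit_softcapping * tanh(logits / f_final_logit_softcapping), which bounds each logit to (-cap, cap) while staying nearly linear near zero; the guard skips the ops when the hparam is unset (0.0f). A standalone numeric sketch, with an illustrative cap value:

#include <cmath>
#include <cstdio>

int main() {
    // Stands in for hparams.f_final_logit_softcapping; the value is illustrative.
    const float cap   = 30.0f;
    const float raw[] = { 1.5f, 25.0f, 120.0f };
    for (const float x : raw) {
        // Same composition as the graph ops above: scale -> tanh -> scale.
        const float capped = cap * std::tanh(x / cap);
        std::printf("%7.1f -> %8.3f\n", x, capped);
    }
}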
@@ -179,8 +179,9 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
     llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params);
 };
 
-struct llm_build_gemma3_iswa : public llm_graph_context {
-    llm_build_gemma3_iswa(const llama_model & model, const llm_graph_params & params);
+template <bool iswa>
+struct llm_build_gemma3 : public llm_graph_context {
+    llm_build_gemma3(const llama_model & model, const llm_graph_params & params);
 };
 
 struct llm_build_gemma3n_iswa : public llm_graph_context {
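Taken together with the explicit instantiations at the end of the source-file diff (template struct llm_build_gemma3<false> and <true>), this header hunk leaves only the template declaration visible: the constructor definition stays out of line, so both variants must be instantiated in the .cpp or other translation units would fail to link. A minimal single-file sketch of that declaration/definition/instantiation split, with hypothetical names:

// builder.h (assumed name) -- declaration only, mirroring the models.h hunk above.
template <bool iswa>
struct builder {
    explicit builder(int n_layer);
};

// builder.cpp -- the definition lives here, followed by explicit instantiations.
template <bool iswa>
builder<iswa>::builder(int n_layer) {
    (void) n_layer; // graph construction would go here
}

template struct builder<false>; // emits builder<false>::builder for the linker
template struct builder<true>;  // emits builder<true>::builder for the linker

int main() {
    builder<true>  a(12);
    builder<false> b(12);
}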