@fugood/llama.node 1.4.3 → 1.4.4

This diff shows the changes between the publicly released versions of the package as they appear in the public registry, and is provided for informational purposes only.
@@ -6554,8 +6554,13 @@ static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params
     ggml_compute_forward_mul_mat(params, &dst);
 }
 
+static inline int64_t ggml_wrap_around(int64_t coord, int64_t size) {
+    return (coord + size) % size; // adding size avoids negative number weirdness
+}
+
 // ggml_compute_forward_conv_2d
 
+
 static void ggml_compute_forward_conv_2d_impl(const ggml_compute_params * params,
                                               const ggml_tensor *         kernel, // [KW, KH, IC, OC]
                                               const ggml_tensor *         src,    // [W, H, C, N]
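The new ggml_wrap_around helper exists because C++'s % operator truncates toward zero, so a negative left operand produces a non-positive remainder; adding size before taking the modulo keeps the result in [0, size) as long as coord >= -size. A minimal standalone sketch of that behavior (illustration only, not part of the package):

    // Standalone sketch of the wrap-around behavior added above.
    // Assumes coord >= -size, which the pad kernel relies on for its coordinates.
    #include <cassert>
    #include <cstdint>

    static inline int64_t wrap_around(int64_t coord, int64_t size) {
        return (coord + size) % size; // plain `coord % size` would give -1 for coord = -1
    }

    int main() {
        assert(wrap_around(-1, 4) == 3); // one step left of index 0 wraps to the last element
        assert(wrap_around( 0, 4) == 0);
        assert(wrap_around( 5, 4) == 1); // out-of-range positive coords wrap as well
        return 0;
    }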
@@ -7591,6 +7596,7 @@ void ggml_compute_forward_upscale(
 
 // ggml_compute_forward_pad
 
+template<bool circular_t>
 static void ggml_compute_forward_pad_f32(
         const ggml_compute_params * params,
               ggml_tensor * dst) {
@@ -7615,23 +7621,40 @@ static void ggml_compute_forward_pad_f32(
     const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
     const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
 
-
     // TODO: optimize
 
     for (int64_t i2 = 0; i2 < ne2; ++i2) {
         for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
             for (int64_t i0 = 0; i0 < ne0; ++i0) {
                 for (int64_t i3 = 0; i3 < ne3; ++i3) {
-                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
-                    if ((i0 >= lp0 && i0 < ne0 - rp0) \
-                        && (i1 >= lp1 && i1 < ne1 - rp1) \
-                        && (i2 >= lp2 && i2 < ne2 - rp2) \
-                        && (i3 >= lp3 && i3 < ne3 - rp3)) {
-                        const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                    // circular means wrap around on a torus, so x and y loop around
+                    if constexpr (circular_t) {
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        const int64_t src_i0 = ggml_wrap_around(i0 - lp0, ne00);
+                        const int64_t src_i1 = ggml_wrap_around(i1 - lp1, ne01);
+                        const int64_t src_i2 = ggml_wrap_around(i2 - lp2, ne02);
+                        const int64_t src_i3 = ggml_wrap_around(i3 - lp3, ne03);
+
+                        const int64_t src_idx =
+                            src_i3*nb03 +
+                            src_i2*nb02 +
+                            src_i1*nb01 +
+                            src_i0*nb00;
+
                         const float * src_ptr = (const float *)((char *) src0->data + src_idx);
                         dst_ptr[dst_idx] = *src_ptr;
                     } else {
-                        dst_ptr[dst_idx] = 0;
+                        const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                        if ((i0 >= lp0 && i0 < ne0 - rp0) \
+                            && (i1 >= lp1 && i1 < ne1 - rp1) \
+                            && (i2 >= lp2 && i2 < ne2 - rp2) \
+                            && (i3 >= lp3 && i3 < ne3 - rp3)) {
+                            const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
+                            const float * src_ptr = (const float *)((char *) src0->data + src_idx);
+                            dst_ptr[dst_idx] = *src_ptr;
+                        } else {
+                            dst_ptr[dst_idx] = 0;
+                        }
                     }
                 }
             }
@@ -7639,16 +7662,20 @@ static void ggml_compute_forward_pad_f32(
     }
 }
 
+
 void ggml_compute_forward_pad(
         const ggml_compute_params * params,
               ggml_tensor * dst) {
-
     const ggml_tensor * src0 = dst->src[0];
-
+    const bool circular = (bool) ggml_get_op_params_i32(dst, 8);
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_pad_f32(params, dst);
+                if (circular) {
+                    ggml_compute_forward_pad_f32<true>(params, dst);
+                } else {
+                    ggml_compute_forward_pad_f32<false>(params, dst);
+                }
             } break;
         default:
             {
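Taken together, the pad changes add a circular mode selected by op param 8: the existing behaviour zero-fills everything outside the source region, while the new mode wraps indices around the source (a torus), and the bool template parameter lets `if constexpr` resolve the branch at compile time. A small self-contained 1D sketch of the two fill modes (illustration only; names and the flat 1D layout are simplifications, not ggml's code):

    // 1D illustration of zero padding vs. circular padding (simplified, not ggml's layout).
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static int64_t wrap_around(int64_t coord, int64_t size) {
        return (coord + size) % size;
    }

    // Pads `src` with `lp` elements on the left and `rp` on the right.
    static std::vector<float> pad_1d(const std::vector<float> & src, int64_t lp, int64_t rp, bool circular) {
        const int64_t n  = (int64_t) src.size();
        const int64_t ne = n + lp + rp;
        std::vector<float> dst(ne);
        for (int64_t i = 0; i < ne; ++i) {
            if (circular) {
                dst[i] = src[wrap_around(i - lp, n)];                    // indices loop around the source
            } else {
                dst[i] = (i >= lp && i < ne - rp) ? src[i - lp] : 0.0f;  // outside the source -> zero
            }
        }
        return dst;
    }

    int main() {
        const std::vector<float> src = {1, 2, 3, 4};
        for (bool circular : {false, true}) {
            for (float v : pad_1d(src, 2, 2, circular)) printf("%g ", v);
            printf("\n"); // zero:     0 0 1 2 3 4 0 0
                          // circular: 3 4 1 2 3 4 1 2
        }
        return 0;
    }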
@@ -1628,6 +1628,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
 
+                // (optional) temperature tuning - used by mistral-large
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
+                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);
+
                 switch (hparams.n_layer) {
                     case 27: type = LLM_TYPE_16B; break;
                     case 60: type = LLM_TYPE_236B; break;
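Both new keys are read with the final argument `false`, i.e. they are optional: models whose GGUF metadata lacks them presumably leave `hparams.f_attn_temp_scale` at its default of 0.0f, which is exactly what the graph-building change further down keys off (`hparams.f_attn_temp_scale != 0.0f`). A generic sketch of that optional-key pattern (the helper and key string below are hypothetical, not llama_model_loader's API):

    // Generic sketch of the optional-key pattern used above (illustration only).
    #include <map>
    #include <stdexcept>
    #include <string>

    struct hparams_t {
        float f_attn_temp_scale = 0.0f; // assumption: stays 0.0f when the key is absent
    };

    // Hypothetical helper: overwrite `out` only if `key` exists; fail only when required.
    static void get_key(const std::map<std::string, float> & kv, const std::string & key,
                        float & out, bool required) {
        auto it = kv.find(key);
        if (it != kv.end()) {
            out = it->second;
        } else if (required) {
            throw std::runtime_error("missing key: " + key);
        }
    }

    int main() {
        hparams_t hp;
        std::map<std::string, float> gguf_kv;                   // metadata without the key
        get_key(gguf_kv, "attention.temperature_scale",         // placeholder key name
                hp.f_attn_temp_scale, /*required=*/false);
        // hp.f_attn_temp_scale is still 0.0f, so temperature tuning stays disabled.
        return 0;
    }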
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
@@ -30,6 +30,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
     // {n_embd, n_tokens}
     inpL = build_inp_embd(model.tok_embd);
 
+    // (optional) temperature tuning - used by mistral-large
+    ggml_tensor * inp_attn_scale = nullptr;
+    if (hparams.f_attn_temp_scale != 0.0f) {
+        inp_attn_scale = build_inp_attn_scale();
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();
 
@@ -128,6 +134,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 ggml_tensor * Vcur = kv_cmpr;
                 cb(Vcur, "Vcur", il);
 
+                if (inp_attn_scale) {
+                    // apply llama 4 temperature scaling
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    cb(Qcur, "Qcur_attn_temp_scaled", il);
+                }
+
                 // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
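`inp_attn_scale` is a per-position tensor produced by `build_inp_attn_scale()` and broadcast-multiplied into Q before attention, so the effective softmax temperature changes with token position (the comment calls it "llama 4 temperature scaling"). The exact expression is not visible in this diff; as an illustration only, here is a scalar sketch of a Llama-4-style per-position scale applied to one query row (the formula is an assumption, not taken from this package):

    // Scalar sketch of per-position attention temperature scaling. Assumed formula:
    //   scale(pos) = 1 + f_attn_temp_scale * log(1 + floor(pos / n_attn_temp_floor_scale))
    // The real expression used by the package is not shown in this diff.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float attn_temp_scale(int pos, float f_scale, int floor_scale) {
        return 1.0f + f_scale * std::log(1.0f + std::floor((float) pos / (float) floor_scale));
    }

    int main() {
        const float f_attn_temp_scale       = 0.1f;   // illustrative values
        const int   n_attn_temp_floor_scale = 8192;

        std::vector<float> q = {0.5f, -1.0f, 2.0f};   // one query row
        const int pos = 20000;                        // token position in the context

        const float s = attn_temp_scale(pos, f_attn_temp_scale, n_attn_temp_floor_scale);
        for (float & x : q) x *= s;                   // per-row effect of ggml_mul(Qcur, inp_attn_scale)

        printf("scale at pos %d = %.4f\n", pos, s);
        return 0;
    }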
@@ -160,6 +172,12 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
                 cb(Kcur, "Kcur", il);
 
+                if (inp_attn_scale) {
+                    // apply llama 4 temperature scaling
+                    Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+                    cb(Qcur, "Qcur_attn_temp_scaled", il);
+                }
+
                 // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
                 cur = build_attn(inp_attn,
                         model.layers[il].wo, NULL,
@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
 // use std::wregex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
-    std::wregex expr(regex_expr);
+    std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 
 // use std::regex to split the text
 static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
-    std::regex expr(regex_expr);
+    std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
    std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
     size_t start = 0;
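Both regex-based splitters only record match offsets and lengths (`bpe_offsets`) and never read capture groups, so constructing the expression with `std::regex_constants::nosubs` (treat all groups as non-capturing) plus `std::regex_constants::optimize` (spend more time building the automaton in exchange for faster matching) is a safe speed-up for repeated tokenizer splits. A small standalone sketch of the same construction with a position-only scan (the loop below is an illustration, not the package's splitting logic):

    // Standalone sketch: build a regex with optimize|nosubs and collect match offsets only.
    #include <cstdio>
    #include <regex>
    #include <string>

    int main() {
        // nosubs: groups like ([A-Za-z]+) are not stored, which is fine when only positions matter.
        // optimize: slower construction, faster repeated matching.
        const std::regex expr("([A-Za-z]+)|([0-9]+)",
                              std::regex_constants::optimize | std::regex_constants::nosubs);

        const std::string text = "llama node 1.4.4";
        for (auto it = std::sregex_iterator(text.begin(), text.end(), expr);
             it != std::sregex_iterator(); ++it) {
            printf("match at %zu, length %zu\n", (size_t) it->position(), (size_t) it->length());
        }
        return 0;
    }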