@fugood/llama.node 1.4.12 → 1.4.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. package/package.json +15 -15
  2. package/scripts/llama.cpp.patch +9 -9
  3. package/src/llama.cpp/common/arg.cpp +99 -45
  4. package/src/llama.cpp/common/chat.cpp +4 -4
  5. package/src/llama.cpp/common/common.cpp +19 -0
  6. package/src/llama.cpp/common/common.h +10 -0
  7. package/src/llama.cpp/common/llguidance.cpp +10 -6
  8. package/src/llama.cpp/common/regex-partial.cpp +13 -13
  9. package/src/llama.cpp/common/sampling.cpp +58 -14
  10. package/src/llama.cpp/common/sampling.h +3 -1
  11. package/src/llama.cpp/include/llama.h +87 -8
  12. package/src/llama.cpp/src/llama-arch.cpp +2 -0
  13. package/src/llama.cpp/src/llama-arch.h +1 -0
  14. package/src/llama.cpp/src/llama-context.cpp +615 -28
  15. package/src/llama.cpp/src/llama-context.h +43 -1
  16. package/src/llama.cpp/src/llama-grammar.cpp +40 -13
  17. package/src/llama.cpp/src/llama-grammar.h +2 -0
  18. package/src/llama.cpp/src/llama-graph.cpp +173 -5
  19. package/src/llama.cpp/src/llama-graph.h +71 -6
  20. package/src/llama.cpp/src/llama-hparams.cpp +4 -0
  21. package/src/llama.cpp/src/llama-hparams.h +8 -2
  22. package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
  23. package/src/llama.cpp/src/llama-model.cpp +51 -11
  24. package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
  25. package/src/llama.cpp/src/llama-sampling.h +16 -7
  26. package/src/llama.cpp/src/llama.cpp +38 -30
  27. package/src/llama.cpp/src/models/afmoe.cpp +9 -5
  28. package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
  29. package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
  30. package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
  31. package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
  32. package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
  33. package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
@@ -14,7 +14,16 @@ struct llama_grammar;
 struct llama_sampler_chain {
     llama_sampler_chain_params params;
 
-    std::vector<struct llama_sampler *> samplers;
+    // has .backend_init() been called?
+    bool is_init = false;
+
+    struct info {
+        bool is_backend;
+
+        llama_sampler * ptr;
+    };
+
+    std::vector<info> samplers;
 
     // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
     std::vector<llama_token_data> cur;
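
The chain now records, per entry, whether a sampler can run on the backend, plus a chain-level is_init flag tracking whether .backend_init() has been called. As a minimal illustration of how code built on the struct above could consume this layout (hypothetical helper, not part of the package):

    #include <cstddef>

    // Hypothetical sketch only: count how many samplers in a chain are
    // backend-capable, using the llama_sampler_chain::info layout shown above.
    static size_t count_backend_samplers(const llama_sampler_chain & chain) {
        size_t n = 0;
        for (const auto & s : chain.samplers) {
            if (s.is_backend) {
                n++; // s.ptr points at the underlying llama_sampler
            }
        }
        return n;
    }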
@@ -27,9 +36,9 @@ struct llama_sampler_chain {
 };
 
 struct llama_sampler * llama_sampler_init_dry_testing(
-    int32_t context_size,
-    float dry_multiplier,
-    float dry_base,
-    int32_t dry_allowed_length,
-    int32_t dry_penalty_last_n,
-    const std::vector<std::vector<llama_token>>& seq_breakers);
+    int32_t context_size,
+    float dry_multiplier,
+    float dry_base,
+    int32_t dry_allowed_length,
+    int32_t dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers);
@@ -359,6 +359,11 @@ static void llama_params_fit_impl(
 
         // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
        layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+        uint32_t n_full() const {
+            assert(n_layer >= n_part);
+            return n_layer - n_part;
+        }
     };
 
     const size_t ntbo = llama_max_tensor_buft_overrides();
@@ -382,7 +387,7 @@ static void llama_params_fit_impl(
 
     size_t itbo = 0;
     for (size_t id = 0; id < nd; id++) {
-        il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+        il0 += ngl_per_device[id].n_full();
         for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
             if (itbo + 1 >= ntbo) {
                 tensor_buft_overrides[itbo].pattern = nullptr;
@@ -393,7 +398,7 @@ static void llama_params_fit_impl(
                    + std::to_string(ntbo) + " is insufficient for model");
            }
            tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-           tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+           tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
            itbo++;
        }
        il0 += ngl_per_device[id].n_part;
@@ -468,20 +473,14 @@ static void llama_params_fit_impl(
        LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
    }
 
-   std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+   std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
    overflow_bufts.reserve(nd);
-   for (size_t id = 0; id < nd - 1; ++id) {
-       overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
+   for (size_t id = 0; id < nd; id++) {
+       overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
    }
-   overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
 
    std::vector<ngl_t> ngl_per_device(nd);
    std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-   if (hp_nex > 0) {
-       for (size_t id = 0; id < nd; id++) {
-           ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
-       }
-   }
 
    // optimize the number of layers per device using the method of false position:
    // - ngl_per_device has 0 layers for each device, lower bound
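
For readers unfamiliar with the search strategy named in the comment above: the method of false position keeps a lower and an upper bound and interpolates the next guess from how far each bound is from the target, rather than bisecting. A generic, self-contained sketch of that idea under simplifying assumptions (strictly increasing cost, lower bound feasible, upper bound not); this is illustrative only, not the code in this file:

    #include <algorithm>
    #include <cstdint>

    // Find the largest n in [lo, hi) whose cost stays at or below target.
    // Assumes cost() is strictly increasing and cost(lo) <= target < cost(hi).
    template <typename CostFn>
    uint32_t fit_layers(uint32_t lo, uint32_t hi, int64_t target, CostFn cost) {
        int64_t mem_lo = cost(lo);
        int64_t mem_hi = cost(hi);
        while (hi - lo > 1) {
            // interpolate between the bounds, same idea as the step_size
            // computation in llama_params_fit_impl:
            int64_t step = int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo);
            step = std::clamp<int64_t>(step, 1, int64_t(hi - lo) - 1);
            const uint32_t mid     = lo + uint32_t(step);
            const int64_t  mem_mid = cost(mid);
            if (mem_mid <= target) {
                lo     = mid;   // still fits: raise the lower bound
                mem_lo = mem_mid;
            } else {
                hi     = mid;   // too big: lower the upper bound
                mem_hi = mem_mid;
            }
        }
        return lo;
    }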
@@ -512,9 +511,6 @@ static void llama_params_fit_impl(
        if (mem_high[id] > targets[id]) {
            assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
            uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-           if (hp_nex > 0 && size_t(id) == nd - 1) {
-               delta--;
-           }
            LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
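
To make that interpolation concrete with invented numbers: if the upper bound has delta = 8 more layers than the lower bound, the device currently uses mem = 10 GiB, the upper bound would use mem_high = 26 GiB, and the target is 16 GiB, then step_size = 8 * (16 - 10) / (26 - 10) = 3, so the next trial assigns 3 additional layers to this device before re-measuring.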
@@ -524,7 +520,8 @@ static void llama_params_fit_impl(
                std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                ngl_per_device_test[id].n_layer += step_size;
                if (hp_nex) {
-                   ngl_per_device_test[id].n_part += step_size;
+                   ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                       step_size - 1 : step_size; // the first layer is the output layer which must always be full
                }
                const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
 
@@ -573,7 +570,7 @@ static void llama_params_fit_impl(
    assert(id_dense_start < nd);
 
    LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-   for (size_t id = 0; id <= id_dense_start; id++) {
+   for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
        std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
        for (size_t jd = id_dense_start; jd < nd; jd++) {
            const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
@@ -585,12 +582,8 @@ static void llama_params_fit_impl(
        std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
 
        if (mem_high[id] > targets[id]) {
-           assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
-           assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
-           assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-               >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
-           uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-               - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+           assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+           uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            while (delta > 1) {
                uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                step_size = std::max(step_size, uint32_t(1));
@@ -606,7 +599,7 @@ static void llama_params_fit_impl(
                    ngl_per_device_test[id].n_layer += n_convert_jd;
                    n_converted_test += n_convert_jd;
 
-                   if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+                   if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                        break;
                    }
                }
@@ -625,8 +618,8 @@ static void llama_params_fit_impl(
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                        __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                }
-               delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                   - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+               assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+               delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
            }
        } else {
            ngl_per_device = ngl_per_device_high;
@@ -644,14 +637,19 @@ static void llama_params_fit_impl(
            ngl_per_device_test[id_dense_start_test].n_part--;
            ngl_per_device_test[id].n_layer++;
            ngl_per_device_test[id].n_part++;
-           if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+           if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                id_dense_start_test++;
            }
            ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+           std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+           if (id < nd - 1) {
+               overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+           }
            LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-           std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+           std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
            if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                ngl_per_device = ngl_per_device_test;
+               overflow_bufts = overflow_bufts_test;
                mem = mem_test;
                id_dense_start = id_dense_start_test;
                LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
@@ -659,9 +657,10 @@ static void llama_params_fit_impl(
 
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-               mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+               mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
+                   overflow_bufts = overflow_bufts_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
@@ -670,9 +669,10 @@ static void llama_params_fit_impl(
            } else {
                ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-               mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
+               mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                    ngl_per_device = ngl_per_device_test;
+                   overflow_bufts = overflow_bufts_test;
                    mem = mem_test;
                    id_dense_start = id_dense_start_test;
                    LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
@@ -687,6 +687,14 @@ static void llama_params_fit_impl(
            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
    }
 
+   // print info for devices that were not changed during the conversion from dense only to full layers:
+   for (size_t id = id_dense_start + 1; id < nd; id++) {
+       const int64_t projected_margin = dmds_full[id].free - mem[id];
+       LLAMA_LOG_INFO(
+           "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+           __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+   }
+
    set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }
 
@@ -713,7 +721,7 @@ enum llama_params_fit_status llama_params_fit(
 
 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
    struct llama_sampler_chain_params result = {
-       /*.no_perf =*/ true,
+       /*.no_perf =*/ true,
    };
 
    return result;
@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
 
    for (int il = 0; il < n_layer; ++il) {
+       const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
 
+       // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+       const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                             (il + 1) % hparams.n_no_rope_layer_step != 0;
+
        // dual attention normalization (pre)
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
        cb(Qcur, "Qcur_normed", il);
        cb(Kcur, "Kcur_normed", il);
 
-       // RoPE only for sliding_attention layers
-       const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-                             ((il + 1) % hparams.n_no_rope_layer_step) != 0;
        if (use_rope) {
            Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Qcur, "Qcur_rope", il);
 
            Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
            cb(Kcur, "Kcur_rope", il);
        }
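
In these builders RoPE is skipped on every n_no_rope_layer_step-th layer. A standalone illustration of which layers get rotated under the (il + 1) % step rule used above (layer count and step are made up for the example):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // hypothetical values: 12 layers, every 4th layer skips RoPE
        const int32_t n_layer              = 12;
        const int32_t n_no_rope_layer_step = 4;

        for (int32_t il = 0; il < n_layer; ++il) {
            // same condition as llm_build_afmoe / llm_build_llama_iswa above
            const bool use_rope = n_no_rope_layer_step > 0 &&
                                  (il + 1) % n_no_rope_layer_step != 0;
            printf("layer %2d: %s\n", il, use_rope ? "RoPE" : "no RoPE");
        }
        return 0;
    }

With these values, layers 3, 7 and 11 skip RoPE. The smallthinker builder further below uses il % n_no_rope_layer_step instead of (il + 1) % n_no_rope_layer_step, so its non-RoPE layers are offset by one and start at layer 0.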
@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const
 
    for (int il = 0; il < n_layer; ++il) {
        const bool is_swa = hparams.is_swa(il);
+       // UNUSED:
+       // const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
        // norm
        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);
@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+       const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        // norm
        cur = build_norm(inpL,
                model.layers[il].attn_norm, NULL,
@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
 
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
 
        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow);
 
        cb(Qcur, "Qcur", il);
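
Several of these builders now fetch freq_base_l / freq_scale_l per layer instead of using the global freq_base / freq_scale, which matters for interleaved sliding-window-attention models. The diff only shows the call sites, so as a rough sketch of what such per-layer selection typically looks like (an assumption about the helpers' behavior, not code from this package):

    // Hypothetical sketch: SWA layers keep the frequencies the model was
    // trained with, other layers use the user-configurable context values.
    struct rope_freqs {
        float base;
        float scale;
    };

    static rope_freqs pick_rope_freqs(bool is_swa_layer,
                                      float base_train_swa, float scale_train_swa,
                                      float base_cparams,   float scale_cparams) {
        if (is_swa_layer) {
            return { base_train_swa, scale_train_swa };
        }
        return { base_cparams, scale_cparams };
    }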
@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+       const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
 
+       // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                              (il + 1) % hparams.n_no_rope_layer_step != 0;
 
@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
        if (use_rope) {
            Qcur = ggml_rope_ext(
                    ctx0, Qcur, inp_pos, rope_factors,
-                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
 
            Kcur = ggml_rope_ext(
                    ctx0, Kcur, inp_pos, rope_factors,
-                   n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                   n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                    ext_factor, attn_factor, beta_fast, beta_slow
                    );
        } else if (inp_attn_scale) {
@@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
    auto * inp_attn = build_attn_inp_no_cache();
 
    for (int il = 0; il < n_layer; ++il) {
-       float freq_base_l = model.get_rope_freq_base(cparams, il);
+       const float freq_base_l = model.get_rope_freq_base(cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
        cur = inpL;
 
@@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
        // RoPE
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
 
        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
 
@@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+       const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
 
        // norm
@@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
 
        Qcur = ggml_rope_ext(
                ctx0, Qcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
 
        Kcur = ggml_rope_ext(
                ctx0, Kcur, inp_pos, nullptr,
-               n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+               n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                ext_factor, attn_factor, beta_fast, beta_slow
                );
 
@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
    ggml_tensor * inp_out_ids = build_inp_out_ids();
 
    for (int il = 0; il < n_layer; ++il) {
+       const float freq_base_l = model.get_rope_freq_base (cparams, il);
+       const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
        ggml_tensor * inpSA = inpL;
-       ggml_tensor * probs = nullptr;
 
-       probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
+       // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+       const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
+                             il % hparams.n_no_rope_layer_step != 0;
+
+       ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
        cb(probs, "ffn_moe_logits", il);
 
        // norm
@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
-       if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) {
-           Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+       if (use_rope) {
+           Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
 
-           Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+           Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                                 ext_factor, attn_factor, beta_fast, beta_slow);
        }
        cb(Qcur, "Qcur", il);