@fugood/llama.node 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/llama.cpp/common/arg.cpp +99 -45
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +19 -0
- package/src/llama.cpp/common/common.h +10 -0
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/include/llama.h +87 -8
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +51 -11
- package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +38 -30
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
package/src/llama.cpp/src/llama-sampling.h:

@@ -14,7 +14,16 @@ struct llama_grammar;
 struct llama_sampler_chain {
     llama_sampler_chain_params params;

-    std::vector<struct llama_sampler *> samplers;
+    // has .backend_init() been called?
+    bool is_init = false;
+
+    struct info {
+        bool is_backend;
+
+        llama_sampler * ptr;
+    };
+
+    std::vector<info> samplers;

     // pre-allocated buffer for llama_sampler_sample to avoid repeated allocations
     std::vector<llama_token_data> cur;
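
Note on the hunk above: each chain entry now records whether it was initialized as a backend sampler. The sketch below is illustrative only (chain_entry and count_backend_samplers are stand-in names, not part of this package or of llama.h); it just shows how such wrapped entries can be inspected:

    // Illustrative only: count the entries of a chain-style wrapper that are
    // flagged as backend samplers, mirroring the new info struct above.
    #include <cstddef>
    #include <vector>

    struct llama_sampler; // opaque handle, as in llama.h

    struct chain_entry {  // stand-in for llama_sampler_chain::info
        bool            is_backend;
        llama_sampler * ptr;
    };

    static size_t count_backend_samplers(const std::vector<chain_entry> & samplers) {
        size_t n = 0;
        for (const chain_entry & e : samplers) {
            if (e.is_backend) {
                n++;
            }
        }
        return n;
    }

    int main() {
        const std::vector<chain_entry> chain = {
            { /*is_backend =*/ true,  /*ptr =*/ nullptr },
            { /*is_backend =*/ false, /*ptr =*/ nullptr },
        };
        return int(count_backend_samplers(chain)); // exits with 1
    }

Keeping the flag per entry presumably lets one chain mix CPU-side and backend-resident samplers without changing the public llama_sampler API.
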
@@ -27,9 +36,9 @@ struct llama_sampler_chain {
 };

 struct llama_sampler * llama_sampler_init_dry_testing(
-        int32_t   context_size,
-        float     dry_multiplier,
-        float     dry_base,
-        int32_t   dry_allowed_length,
-        int32_t   dry_penalty_last_n,
-        const std::vector<std::vector<llama_token>>& seq_breakers);
+    int32_t context_size,
+    float dry_multiplier,
+    float dry_base,
+    int32_t dry_allowed_length,
+    int32_t dry_penalty_last_n,
+    const std::vector<std::vector<llama_token>> & seq_breakers);
package/src/llama.cpp/src/llama.cpp:

@@ -359,6 +359,11 @@ static void llama_params_fit_impl(

         // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
         layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+
+        uint32_t n_full() const {
+            assert(n_layer >= n_part);
+            return n_layer - n_part;
+        }
     };

     const size_t ntbo = llama_max_tensor_buft_overrides();
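
Note: the new n_full() helper above only centralizes the bookkeeping "fully offloaded layers = assigned layers minus partially offloaded layers". A free-standing sketch of the same arithmetic, using illustrative names rather than the package's internal ngl_t struct:

    // Sketch of the n_full() bookkeeping: a device is assigned n_layer layers,
    // n_part of which keep only a fraction on the device; the rest are "full".
    #include <cassert>
    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    struct ngl_sketch {        // loosely mirrors the ngl_t bookkeeping in the diff
        uint32_t n_layer = 0;  // layers assigned to the device
        uint32_t n_part  = 0;  // of those, layers that partially overflow elsewhere

        uint32_t n_full() const {
            assert(n_layer >= n_part);
            return n_layer - n_part;
        }
    };

    int main() {
        ngl_sketch d;
        d.n_layer = 20;
        d.n_part  = 3;
        std::printf("full layers: %" PRIu32 "\n", d.n_full()); // prints 17
        return 0;
    }
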
@@ -382,7 +387,7 @@ static void llama_params_fit_impl(

     size_t itbo = 0;
     for (size_t id = 0; id < nd; id++) {
-        il0 += ngl_per_device[id].
+        il0 += ngl_per_device[id].n_full();
         for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
             if (itbo + 1 >= ntbo) {
                 tensor_buft_overrides[itbo].pattern = nullptr;

@@ -393,7 +398,7 @@ static void llama_params_fit_impl(
                     + std::to_string(ntbo) + " is insufficient for model");
             }
             tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
-            tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+            tensor_buft_overrides[itbo].buft = il == il0 ? overflow_bufts[id] : ggml_backend_cpu_buffer_type();
             itbo++;
         }
         il0 += ngl_per_device[id].n_part;

@@ -468,20 +473,14 @@ static void llama_params_fit_impl(
         LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
     }

-    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial
+    std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the first partial layer of a device overflows to:
     overflow_bufts.reserve(nd);
-    for (size_t id = 0; id < nd
-        overflow_bufts.push_back(
+    for (size_t id = 0; id < nd; id++) {
+        overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
     }
-    overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

     std::vector<ngl_t> ngl_per_device(nd);
     std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
-    if (hp_nex > 0) {
-        for (size_t id = 0; id < nd; id++) {
-            ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
-        }
-    }

     // optimize the number of layers per device using the method of false position:
     // - ngl_per_device has 0 layers for each device, lower bound

@@ -512,9 +511,6 @@ static void llama_params_fit_impl(
         if (mem_high[id] > targets[id]) {
             assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
             uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
-            if (hp_nex > 0 && size_t(id) == nd - 1) {
-                delta--;
-            }
             LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);

@@ -524,7 +520,8 @@ static void llama_params_fit_impl(
                 std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
                 ngl_per_device_test[id].n_layer += step_size;
                 if (hp_nex) {
-                    ngl_per_device_test[id].n_part +=
+                    ngl_per_device_test[id].n_part += size_t(id) == nd - 1 && ngl_per_device_test[id].n_part == 0 ?
+                        step_size - 1 : step_size; // the first layer is the output layer which must always be full
                 }
                 const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

@@ -573,7 +570,7 @@ static void llama_params_fit_impl(
     assert(id_dense_start < nd);

     LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
-    for (size_t id = 0; id <= id_dense_start; id++) {
+    for (size_t id = 0; id <= id_dense_start && id_dense_start < nd; id++) {
         std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
         for (size_t jd = id_dense_start; jd < nd; jd++) {
             const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;

@@ -585,12 +582,8 @@ static void llama_params_fit_impl(
         std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

         if (mem_high[id] > targets[id]) {
-            assert(ngl_per_device_high[id].
-
-            assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
-            uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
-                - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+            assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+            uint32_t delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             while (delta > 1) {
                 uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
                 step_size = std::max(step_size, uint32_t(1));

@@ -606,7 +599,7 @@ static void llama_params_fit_impl(
                     ngl_per_device_test[id].n_layer += n_convert_jd;
                     n_converted_test += n_convert_jd;

-                    if (ngl_per_device_test[id_dense_start_test].
+                    if (ngl_per_device_test[id_dense_start_test].n_part > 0) {
                         break;
                     }
                 }

@@ -625,8 +618,8 @@ static void llama_params_fit_impl(
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
                     __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
                 }
-
-
+                assert(ngl_per_device_high[id].n_full() >= ngl_per_device[id].n_full());
+                delta = ngl_per_device_high[id].n_full() - ngl_per_device[id].n_full();
             }
         } else {
             ngl_per_device = ngl_per_device_high;

@@ -644,14 +637,19 @@ static void llama_params_fit_impl(
             ngl_per_device_test[id_dense_start_test].n_part--;
             ngl_per_device_test[id].n_layer++;
             ngl_per_device_test[id].n_part++;
-            if (ngl_per_device_test[id_dense_start_test].
+            if (ngl_per_device_test[id_dense_start_test].n_part == 0) {
                 id_dense_start_test++;
             }
             ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+            std::vector<ggml_backend_buffer_type_t> overflow_bufts_test = overflow_bufts;
+            if (id < nd - 1) {
+                overflow_bufts_test[id] = ggml_backend_dev_buffer_type(devs[id + 1]);
+            }
             LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
-            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+            std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
             if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                 ngl_per_device = ngl_per_device_test;
+                overflow_bufts = overflow_bufts_test;
                 mem = mem_test;
                 id_dense_start = id_dense_start_test;
                 LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",

@@ -659,9 +657,10 @@ static void llama_params_fit_impl(

                 ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
                 LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
-                mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+                mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                 if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                     ngl_per_device = ngl_per_device_test;
+                    overflow_bufts = overflow_bufts_test;
                     mem = mem_test;
                     id_dense_start = id_dense_start_test;
                     LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",

@@ -670,9 +669,10 @@ static void llama_params_fit_impl(
                 } else {
                     ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
                     LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
-                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test,
+                    mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts_test);
                     if (mem_test[id] < targets[id] && (id + 1 == nd || mem_test[id + 1] < targets[id + 1])) {
                         ngl_per_device = ngl_per_device_test;
+                        overflow_bufts = overflow_bufts_test;
                         mem = mem_test;
                         id_dense_start = id_dense_start_test;
                         LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",

@@ -687,6 +687,14 @@ static void llama_params_fit_impl(
             __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
     }

+    // print info for devices that were not changed during the conversion from dense only to full layers:
+    for (size_t id = id_dense_start + 1; id < nd; id++) {
+        const int64_t projected_margin = dmds_full[id].free - mem[id];
+        LLAMA_LOG_INFO(
+            "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+            __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+    }
+
     set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
 }

@@ -713,7 +721,7 @@ enum llama_params_fit_status llama_params_fit(

 struct llama_sampler_chain_params llama_sampler_chain_default_params() {
     struct llama_sampler_chain_params result = {
-        /*.no_perf
+        /*.no_perf =*/ true,
     };

     return result;
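
Note: the fitting hunks above repeatedly narrow a per-device layer count against a memory target using the method of false position (see the comment "optimize the number of layers per device using the method of false position"). The following is a generic sketch of that kind of search, independent of the package internals; fit_layers and mem_of are hypothetical names:

    // Regula-falsi style search over an integer layer count. mem_of(n) stands in
    // for "memory needed with n layers"; target is the per-device budget.
    #include <cstdint>
    #include <functional>

    static uint32_t fit_layers(
            uint32_t lo, uint32_t hi, int64_t mem_lo, int64_t mem_hi, int64_t target,
            const std::function<int64_t(uint32_t)> & mem_of) {
        // invariants assumed: lo < hi, mem_lo <= target < mem_hi
        while (hi - lo > 1) {
            uint32_t step = uint32_t(int64_t(hi - lo) * (target - mem_lo) / (mem_hi - mem_lo));
            if (step < 1)           { step = 1; }
            if (step > hi - lo - 1) { step = hi - lo - 1; }
            const uint32_t mid     = lo + step;
            const int64_t  mem_mid = mem_of(mid);
            if (mem_mid > target) {
                hi = mid; mem_hi = mem_mid; // needs too much memory: new upper bound
            } else {
                lo = mid; mem_lo = mem_mid; // still fits: new lower bound
            }
        }
        return lo; // largest layer count known to fit the target
    }

    int main() {
        // toy example: each layer costs 100 units, budget is 750 -> 7 layers fit
        const auto mem_of = [](uint32_t n) { return int64_t(n) * 100; };
        return int(fit_layers(0, 32, mem_of(0), mem_of(32), 750, mem_of));
    }

The real code appears to apply the same idea per device, with extra handling for partially offloaded (MoE) layers and for the output layer, which the diff's comment says must always stay full.
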
package/src/llama.cpp/src/models/afmoe.cpp:

@@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
     const float kq_scale = 1.0f/sqrtf(float(n_embd_head));

     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;

+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
+                              (il + 1) % hparams.n_no_rope_layer_step != 0;
+
         // dual attention normalization (pre)
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,

@@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para
         cb(Qcur, "Qcur_normed", il);
         cb(Kcur, "Kcur_normed", il);

-        // RoPE only for sliding_attention layers
-        const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
-                              ((il + 1) % hparams.n_no_rope_layer_step) != 0;
         if (use_rope) {
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur_rope", il);

             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur_rope", il);
         }
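
Note: the model-builder hunks here and below follow one pattern: the per-layer RoPE frequency base and scale are read once at the top of the layer loop and passed explicitly to ggml_rope_ext, and the use_rope decision is hoisted next to them. A simplified, self-contained sketch of that control flow follows; every type and getter in it is a placeholder, not the package's code:

    // Stand-in sketch of the per-layer RoPE parameter pattern used by the
    // updated model builders; all names here are placeholders.
    #include <cstdint>

    struct hparams_sketch {
        uint32_t n_no_rope_layer_step = 4; // example: every 4th layer skips RoPE
    };

    struct model_sketch {
        float get_rope_freq_base (int /*il*/) const { return 10000.0f; } // placeholder value
        float get_rope_freq_scale(int /*il*/) const { return 1.0f;     } // placeholder value
    };

    static void build_layers(const model_sketch & model, const hparams_sketch & hparams, int n_layer) {
        for (int il = 0; il < n_layer; ++il) {
            // read the per-layer RoPE parameters once, at the top of the loop
            const float freq_base_l  = model.get_rope_freq_base (il);
            const float freq_scale_l = model.get_rope_freq_scale(il);

            // decide up front whether this layer applies RoPE at all
            const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                                  (il + 1) % hparams.n_no_rope_layer_step != 0;

            if (use_rope) {
                // the real builders pass freq_base_l/freq_scale_l to ggml_rope_ext here
                (void) freq_base_l;
                (void) freq_scale_l;
            }
        }
    }

    int main() {
        build_layers(model_sketch{}, hparams_sketch{}, 8);
        return 0;
    }
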
package/src/llama.cpp/src/models/cohere2-iswa.cpp:

@@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const

     for (int il = 0; il < n_layer; ++il) {
         const bool is_swa = hparams.is_swa(il);
+        // UNUSED:
+        // const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        // const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il);

package/src/llama.cpp/src/models/gemma2-iswa.cpp:

@@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll
     ggml_tensor * inp_out_ids = build_inp_out_ids();

     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         // norm
         cur = build_norm(inpL,
                 model.layers[il].attn_norm, NULL,

@@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll

         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow);

         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow);

         cb(Qcur, "Qcur", il);

package/src/llama.cpp/src/models/llama-iswa.cpp:

@@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
     ggml_tensor * inp_out_ids = build_inp_out_ids();

     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;

+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
         const bool use_rope = hparams.n_no_rope_layer_step > 0 &&
                               (il + 1) % hparams.n_no_rope_layer_step != 0;

@@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_
         if (use_rope) {
             Qcur = ggml_rope_ext(
                     ctx0, Qcur, inp_pos, rope_factors,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );

             Kcur = ggml_rope_ext(
                     ctx0, Kcur, inp_pos, rope_factors,
-                    n_rot, rope_type, n_ctx_orig,
+                    n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                     ext_factor, attn_factor, beta_fast, beta_slow
                     );
         } else if (inp_attn_scale) {

package/src/llama.cpp/src/models/modern-bert.cpp:

@@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
     auto * inp_attn = build_attn_inp_no_cache();

     for (int il = 0; il < n_layer; ++il) {
-        float freq_base_l
+        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

         cur = inpL;

@@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll
         // RoPE
         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );

         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base_l,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );

package/src/llama.cpp/src/models/openai-moe-iswa.cpp:

@@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,
     ggml_tensor * inp_out_ids = build_inp_out_ids();

     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;

         // norm

@@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model,

         Qcur = ggml_rope_ext(
                 ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );

         Kcur = ggml_rope_ext(
                 ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig,
+                n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                 ext_factor, attn_factor, beta_fast, beta_slow
                 );

package/src/llama.cpp/src/models/smallthinker.cpp:

@@ -26,10 +26,16 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
     ggml_tensor * inp_out_ids = build_inp_out_ids();

     for (int il = 0; il < n_layer; ++il) {
+        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+
         ggml_tensor * inpSA = inpL;
-        ggml_tensor * probs = nullptr;

-
+        // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous
+        const bool use_rope = hparams.n_no_rope_layer_step == n_layer ||
+                              il % hparams.n_no_rope_layer_step != 0;
+
+        ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens]
         cb(probs, "ffn_moe_logits", il);

         // norm

@@ -52,11 +58,11 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
         Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
         Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-        if (
-        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+        if (use_rope) {
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                               ext_factor, attn_factor, beta_fast, beta_slow);

-        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
                               ext_factor, attn_factor, beta_fast, beta_slow);
         }
         cb(Qcur, "Qcur", il);