@fugood/llama.node 1.4.12 → 1.4.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +9 -9
- package/src/llama.cpp/common/arg.cpp +99 -45
- package/src/llama.cpp/common/chat.cpp +4 -4
- package/src/llama.cpp/common/common.cpp +19 -0
- package/src/llama.cpp/common/common.h +10 -0
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/include/llama.h +87 -8
- package/src/llama.cpp/src/llama-arch.cpp +2 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +615 -28
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +8 -2
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +51 -11
- package/src/llama.cpp/src/llama-sampling.cpp +1232 -170
- package/src/llama.cpp/src/llama-sampling.h +16 -7
- package/src/llama.cpp/src/llama.cpp +38 -30
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/modern-bert.cpp +4 -3
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
#include "llama-vocab.h"
|
|
5
5
|
#include "llama-grammar.h"
|
|
6
6
|
|
|
7
|
+
#include "ggml-cpp.h"
|
|
8
|
+
|
|
7
9
|
#include <array>
|
|
8
10
|
#include <algorithm>
|
|
9
11
|
#include <cassert>
|
|
@@ -346,7 +348,9 @@ static uint32_t get_rng_seed(uint32_t seed) {
|
|
|
346
348
|
|
|
347
349
|
// llama_sampler API
|
|
348
350
|
|
|
349
|
-
struct llama_sampler * llama_sampler_init(
|
|
351
|
+
struct llama_sampler * llama_sampler_init(
|
|
352
|
+
struct llama_sampler_i * iface,
|
|
353
|
+
llama_sampler_context_t ctx) {
|
|
350
354
|
return new llama_sampler {
|
|
351
355
|
/* .iface = */ iface,
|
|
352
356
|
/* .ctx = */ ctx,
|
|
@@ -421,6 +425,202 @@ void llama_sampler_free(struct llama_sampler * smpl) {
|
|
|
421
425
|
delete smpl;
|
|
422
426
|
}
|
|
423
427
|
|
|
428
|
+
// empty sampler
|
|
429
|
+
|
|
430
|
+
struct llama_sampler_empty {
|
|
431
|
+
const char * name;
|
|
432
|
+
};
|
|
433
|
+
|
|
434
|
+
static struct llama_sampler * llama_sampler_init_empty(const char * name);
|
|
435
|
+
|
|
436
|
+
static const char * llama_sampler_empty_name(const struct llama_sampler * smpl) {
|
|
437
|
+
auto * ctx = (llama_sampler_empty *) smpl->ctx;
|
|
438
|
+
return ctx->name;
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
static void llama_sampler_empty_accept(struct llama_sampler * smpl, llama_token token) {
|
|
442
|
+
GGML_UNUSED(smpl);
|
|
443
|
+
GGML_UNUSED(token);
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
static void llama_sampler_empty_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
447
|
+
GGML_UNUSED(smpl);
|
|
448
|
+
GGML_UNUSED(cur_p);
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
static void llama_sampler_empty_reset(struct llama_sampler * smpl) {
|
|
452
|
+
GGML_UNUSED(smpl);
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
static struct llama_sampler * llama_sampler_empty_clone(const struct llama_sampler * smpl) {
|
|
456
|
+
auto * ctx = (llama_sampler_empty *) smpl->ctx;
|
|
457
|
+
return llama_sampler_init_empty(ctx->name);
|
|
458
|
+
}
|
|
459
|
+
|
|
460
|
+
static void llama_sampler_empty_free(struct llama_sampler * smpl) {
|
|
461
|
+
delete (llama_sampler_empty *) smpl->ctx;
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
static bool llama_sampler_empty_backend_init(
|
|
465
|
+
struct llama_sampler * smpl,
|
|
466
|
+
ggml_backend_buffer_type_t buft) {
|
|
467
|
+
GGML_UNUSED(smpl);
|
|
468
|
+
GGML_UNUSED(buft);
|
|
469
|
+
|
|
470
|
+
return true;
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
static void llama_sampler_empty_backend_accept(
|
|
474
|
+
struct llama_sampler * smpl,
|
|
475
|
+
ggml_context * ctx,
|
|
476
|
+
ggml_cgraph * gf,
|
|
477
|
+
struct ggml_tensor * selected_token) {
|
|
478
|
+
GGML_UNUSED(smpl);
|
|
479
|
+
GGML_UNUSED(ctx);
|
|
480
|
+
GGML_UNUSED(gf);
|
|
481
|
+
GGML_UNUSED(selected_token);
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
static void llama_sampler_empty_backend_apply(
|
|
485
|
+
struct llama_sampler * smpl,
|
|
486
|
+
struct ggml_context * ctx,
|
|
487
|
+
struct ggml_cgraph * gf,
|
|
488
|
+
struct llama_sampler_data * data) {
|
|
489
|
+
GGML_UNUSED(smpl);
|
|
490
|
+
GGML_UNUSED(ctx);
|
|
491
|
+
GGML_UNUSED(gf);
|
|
492
|
+
GGML_UNUSED(data);
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
static void llama_sampler_empty_backend_set_input(struct llama_sampler * smpl) {
|
|
496
|
+
GGML_UNUSED(smpl);
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
static struct llama_sampler_i llama_sampler_empty_i = {
|
|
500
|
+
/* .name = */ llama_sampler_empty_name,
|
|
501
|
+
/* .accept = */ llama_sampler_empty_accept,
|
|
502
|
+
/* .apply = */ llama_sampler_empty_apply,
|
|
503
|
+
/* .reset = */ llama_sampler_empty_reset,
|
|
504
|
+
/* .clone = */ llama_sampler_empty_clone,
|
|
505
|
+
/* .free = */ llama_sampler_empty_free,
|
|
506
|
+
/* .backend_init = */ llama_sampler_empty_backend_init,
|
|
507
|
+
/* .backend_accept = */ llama_sampler_empty_backend_accept,
|
|
508
|
+
/* .backend_apply = */ llama_sampler_empty_backend_apply,
|
|
509
|
+
/* .backend_set_input = */ llama_sampler_empty_backend_set_input,
|
|
510
|
+
};
|
|
511
|
+
|
|
512
|
+
struct llama_sampler * llama_sampler_init_empty(const char * name) {
|
|
513
|
+
return llama_sampler_init(
|
|
514
|
+
/* .iface = */ &llama_sampler_empty_i,
|
|
515
|
+
/* .ctx = */ new llama_sampler_empty {
|
|
516
|
+
/* .name = */ name,
|
|
517
|
+
}
|
|
518
|
+
);
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
// common backend sampler functionality
|
|
522
|
+
//
|
|
523
|
+
// +name : means that the sampler is supported and will run on the backend
|
|
524
|
+
// -name : means that a ggml operator is not supported by the backend
|
|
525
|
+
//
|
|
526
|
+
struct llama_sampler_backend {
|
|
527
|
+
llama_sampler_backend(const char * name) : name(name), name_ext(name), is_init(false), support(false) {}
|
|
528
|
+
|
|
529
|
+
const char * get_name() {
|
|
530
|
+
if (!is_init) {
|
|
531
|
+
return name.c_str();
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
if (support) {
|
|
535
|
+
name_ext = "+" + name;
|
|
536
|
+
} else {
|
|
537
|
+
name_ext = "-" + name;
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
return name_ext.c_str();
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
void init(bool support) {
|
|
544
|
+
GGML_ASSERT(this->is_init == false);
|
|
545
|
+
|
|
546
|
+
this->is_init = true;
|
|
547
|
+
this->support = support;
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
private:
|
|
551
|
+
std::string name;
|
|
552
|
+
std::string name_ext;
|
|
553
|
+
|
|
554
|
+
bool is_init;
|
|
555
|
+
bool support;
|
|
556
|
+
};
|
|
557
|
+
|
|
558
|
+
// check if all ggml ops used by the sampler are supported by the backend
|
|
559
|
+
static bool llama_sampler_backend_support(
|
|
560
|
+
llama_sampler * smpl,
|
|
561
|
+
ggml_backend_buffer_type_t buft) {
|
|
562
|
+
auto * device = ggml_backend_buft_get_device(buft);
|
|
563
|
+
if (!device) {
|
|
564
|
+
// CPU backend always supported
|
|
565
|
+
return true;
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
ggml_init_params params = {
|
|
569
|
+
/*.mem_size =*/ 128*ggml_tensor_overhead() + ggml_graph_overhead(),
|
|
570
|
+
/*.mem_buffer =*/ NULL,
|
|
571
|
+
/*.no_alloc =*/ true,
|
|
572
|
+
};
|
|
573
|
+
|
|
574
|
+
ggml_context_ptr ctx_ptr { ggml_init(params) };
|
|
575
|
+
if (!ctx_ptr) {
|
|
576
|
+
throw std::runtime_error(format("failed to create ggml context"));
|
|
577
|
+
}
|
|
578
|
+
|
|
579
|
+
ggml_context * ctx = ctx_ptr.get();
|
|
580
|
+
|
|
581
|
+
const int64_t n = 1024*1024;
|
|
582
|
+
|
|
583
|
+
llama_sampler_data data = {
|
|
584
|
+
/*.logits = */ ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n),
|
|
585
|
+
/*.probs = */ nullptr,
|
|
586
|
+
/*.sampled = */ nullptr,
|
|
587
|
+
/*.candidates = */ ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n),
|
|
588
|
+
};
|
|
589
|
+
|
|
590
|
+
ggml_cgraph * gf = ggml_new_graph(ctx);
|
|
591
|
+
|
|
592
|
+
smpl->iface->backend_apply(smpl, ctx, gf, &data);
|
|
593
|
+
|
|
594
|
+
if (data.logits) {
|
|
595
|
+
ggml_build_forward_expand(gf, data.logits);
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
if (data.probs) {
|
|
599
|
+
ggml_build_forward_expand(gf, data.probs);
|
|
600
|
+
}
|
|
601
|
+
|
|
602
|
+
if (data.sampled) {
|
|
603
|
+
ggml_build_forward_expand(gf, data.sampled);
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
if (data.candidates) {
|
|
607
|
+
ggml_build_forward_expand(gf, data.candidates);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
|
|
611
|
+
struct ggml_tensor * op = ggml_graph_node(gf, i);
|
|
612
|
+
|
|
613
|
+
if (!ggml_backend_dev_supports_op(device, op)) {
|
|
614
|
+
LLAMA_LOG_WARN("%s: device '%s' does not have support for op %s needed for sampler '%s'\n",
|
|
615
|
+
__func__, ggml_backend_dev_name(device), ggml_op_name(op->op), smpl->iface->name(smpl));
|
|
616
|
+
|
|
617
|
+
return false;
|
|
618
|
+
}
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
return true;
|
|
622
|
+
}
|
|
623
|
+
|
|
424
624
|
// sampler chain
|
|
425
625
|
|
|
426
626
|
static const char * llama_sampler_chain_name(const struct llama_sampler * /*smpl*/) {
|
|
@@ -432,8 +632,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token
|
|
|
432
632
|
|
|
433
633
|
time_meas tm(chain->t_sample_us, chain->params.no_perf);
|
|
434
634
|
|
|
435
|
-
for (auto
|
|
436
|
-
llama_sampler_accept(smpl, token);
|
|
635
|
+
for (auto & smpl : chain->samplers) {
|
|
636
|
+
llama_sampler_accept(smpl.ptr, token);
|
|
437
637
|
}
|
|
438
638
|
|
|
439
639
|
chain->n_sample++;
|
|
@@ -444,16 +644,28 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d
|
|
|
444
644
|
|
|
445
645
|
time_meas tm(chain->t_sample_us, chain->params.no_perf);
|
|
446
646
|
|
|
447
|
-
|
|
448
|
-
|
|
647
|
+
bool is_backend = chain->is_init;
|
|
648
|
+
|
|
649
|
+
for (auto & smpl : chain->samplers) {
|
|
650
|
+
if (is_backend && smpl.is_backend) {
|
|
651
|
+
continue;
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
is_backend = false;
|
|
655
|
+
|
|
656
|
+
if (smpl.ptr->iface->apply == nullptr) {
|
|
657
|
+
continue;
|
|
658
|
+
}
|
|
659
|
+
|
|
660
|
+
llama_sampler_apply(smpl.ptr, cur_p);
|
|
449
661
|
}
|
|
450
662
|
}
|
|
451
663
|
|
|
452
664
|
static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
|
|
453
665
|
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
454
666
|
|
|
455
|
-
for (auto
|
|
456
|
-
llama_sampler_reset(smpl);
|
|
667
|
+
for (auto & smpl : chain->samplers) {
|
|
668
|
+
llama_sampler_reset(smpl.ptr);
|
|
457
669
|
}
|
|
458
670
|
}
|
|
459
671
|
|
|
@@ -462,8 +674,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
|
|
|
462
674
|
|
|
463
675
|
auto * result = llama_sampler_chain_init(chain_src->params);
|
|
464
676
|
|
|
465
|
-
for (auto
|
|
466
|
-
llama_sampler_chain_add(result, llama_sampler_clone(smpl));
|
|
677
|
+
for (const auto & smpl : chain_src->samplers) {
|
|
678
|
+
llama_sampler_chain_add(result, llama_sampler_clone(smpl.ptr));
|
|
467
679
|
}
|
|
468
680
|
|
|
469
681
|
return result;
|
|
@@ -472,20 +684,109 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
|
|
|
472
684
|
static void llama_sampler_chain_free(struct llama_sampler * smpl) {
|
|
473
685
|
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
474
686
|
|
|
475
|
-
for (auto
|
|
476
|
-
llama_sampler_free(smpl);
|
|
687
|
+
for (auto & smpl : chain->samplers) {
|
|
688
|
+
llama_sampler_free(smpl.ptr);
|
|
477
689
|
}
|
|
478
690
|
|
|
479
691
|
delete chain;
|
|
480
692
|
}
|
|
481
693
|
|
|
694
|
+
static bool llama_sampler_chain_backend_init(
|
|
695
|
+
struct llama_sampler * smpl,
|
|
696
|
+
ggml_backend_buffer_type_t buft) {
|
|
697
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
698
|
+
|
|
699
|
+
GGML_ASSERT(chain->is_init == false && "llama_sampler_chain_backend_init() called twice");
|
|
700
|
+
|
|
701
|
+
chain->is_init = true;
|
|
702
|
+
|
|
703
|
+
bool res = true;
|
|
704
|
+
|
|
705
|
+
for (auto & smpl : chain->samplers) {
|
|
706
|
+
bool res_cur = true;
|
|
707
|
+
|
|
708
|
+
// to be able to run a sampler on the backend, it has to:
|
|
709
|
+
// - have the .backend_init() API implemented
|
|
710
|
+
// - return true during .backend_init()
|
|
711
|
+
if (smpl.ptr->iface->backend_init) {
|
|
712
|
+
if (!smpl.ptr->iface->backend_init(smpl.ptr, buft)) {
|
|
713
|
+
res_cur = false;
|
|
714
|
+
}
|
|
715
|
+
} else {
|
|
716
|
+
res_cur = false;
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
smpl.is_backend = res_cur;
|
|
720
|
+
|
|
721
|
+
res = res && res_cur;
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
return res;
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
static void llama_sampler_chain_backend_accept(
|
|
728
|
+
struct llama_sampler * smpl,
|
|
729
|
+
ggml_context * ctx,
|
|
730
|
+
ggml_cgraph * gf,
|
|
731
|
+
struct ggml_tensor * selected_token) {
|
|
732
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
733
|
+
|
|
734
|
+
for (auto & smpl : chain->samplers) {
|
|
735
|
+
if (!smpl.is_backend) {
|
|
736
|
+
break;
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
if (smpl.ptr->iface->backend_accept) {
|
|
740
|
+
smpl.ptr->iface->backend_accept(smpl.ptr, ctx, gf, selected_token);
|
|
741
|
+
}
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
|
|
745
|
+
static void llama_sampler_chain_backend_apply(
|
|
746
|
+
struct llama_sampler * smpl,
|
|
747
|
+
struct ggml_context * ctx,
|
|
748
|
+
struct ggml_cgraph * gf,
|
|
749
|
+
struct llama_sampler_data * data) {
|
|
750
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
751
|
+
|
|
752
|
+
GGML_ASSERT(chain->is_init && "llama_sampler_chain_backend_init() not called");
|
|
753
|
+
|
|
754
|
+
for (auto & smpl : chain->samplers) {
|
|
755
|
+
if (!smpl.is_backend) {
|
|
756
|
+
break;
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
if (smpl.ptr->iface->backend_apply) {
|
|
760
|
+
smpl.ptr->iface->backend_apply(smpl.ptr, ctx, gf, data);
|
|
761
|
+
}
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
static void llama_sampler_chain_backend_set_input(struct llama_sampler * smpl) {
|
|
766
|
+
auto * chain = (llama_sampler_chain *) smpl->ctx;
|
|
767
|
+
|
|
768
|
+
for (auto & smpl : chain->samplers) {
|
|
769
|
+
if (!smpl.is_backend) {
|
|
770
|
+
break;
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
if (smpl.ptr->iface->backend_set_input) {
|
|
774
|
+
smpl.ptr->iface->backend_set_input(smpl.ptr);
|
|
775
|
+
}
|
|
776
|
+
}
|
|
777
|
+
}
|
|
778
|
+
|
|
482
779
|
static struct llama_sampler_i llama_sampler_chain_i = {
|
|
483
|
-
/* .name
|
|
484
|
-
/* .accept
|
|
485
|
-
/* .apply
|
|
486
|
-
/* .reset
|
|
487
|
-
/* .clone
|
|
488
|
-
/* .free
|
|
780
|
+
/* .name = */ llama_sampler_chain_name,
|
|
781
|
+
/* .accept = */ llama_sampler_chain_accept,
|
|
782
|
+
/* .apply = */ llama_sampler_chain_apply,
|
|
783
|
+
/* .reset = */ llama_sampler_chain_reset,
|
|
784
|
+
/* .clone = */ llama_sampler_chain_clone,
|
|
785
|
+
/* .free = */ llama_sampler_chain_free,
|
|
786
|
+
/* .backend_init = */ llama_sampler_chain_backend_init,
|
|
787
|
+
/* .backend_accept = */ llama_sampler_chain_backend_accept,
|
|
788
|
+
/* .backend_apply = */ llama_sampler_chain_backend_apply,
|
|
789
|
+
/* .backend_set_input = */ llama_sampler_chain_backend_set_input,
|
|
489
790
|
};
|
|
490
791
|
|
|
491
792
|
struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
|
|
@@ -493,6 +794,7 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
|
|
|
493
794
|
/* .iface = */ &llama_sampler_chain_i,
|
|
494
795
|
/* .ctx = */ new llama_sampler_chain {
|
|
495
796
|
/* .params = */ params,
|
|
797
|
+
/* .is_init = */ false,
|
|
496
798
|
/* .samplers = */ {},
|
|
497
799
|
/* .cur = */ {},
|
|
498
800
|
/* .t_sample_us = */ 0,
|
|
@@ -502,7 +804,16 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
|
|
|
502
804
|
}
|
|
503
805
|
|
|
504
806
|
llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
|
|
505
|
-
const
|
|
807
|
+
const llama_token sampled_token = llama_get_sampled_token_ith (ctx, idx);
|
|
808
|
+
const float * sampled_probs = llama_get_sampled_probs_ith (ctx, idx);
|
|
809
|
+
const float * sampled_logits = llama_get_sampled_logits_ith (ctx, idx);
|
|
810
|
+
const llama_token * sampled_ids = llama_get_sampled_candidates_ith(ctx, idx);
|
|
811
|
+
|
|
812
|
+
// If a backend sampler has already sampled a token, return it.
|
|
813
|
+
if (sampled_token != LLAMA_TOKEN_NULL) {
|
|
814
|
+
LLAMA_LOG_DEBUG("%s: Backend sampler selected token for idx %d. Skipping CPU samplers\n", __func__, idx);
|
|
815
|
+
return sampled_token;
|
|
816
|
+
}
|
|
506
817
|
|
|
507
818
|
const llama_model * model = llama_get_model(ctx);
|
|
508
819
|
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
@@ -521,9 +832,26 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
|
|
|
521
832
|
}
|
|
522
833
|
|
|
523
834
|
auto & cur = *cur_ptr;
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
835
|
+
|
|
836
|
+
if (sampled_probs) {
|
|
837
|
+
const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
|
|
838
|
+
cur.resize(sampled_probs_count);
|
|
839
|
+
for (uint32_t i = 0; i < sampled_probs_count; ++i) {
|
|
840
|
+
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
|
|
841
|
+
}
|
|
842
|
+
} else if (sampled_logits) {
|
|
843
|
+
const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
|
|
844
|
+
cur.resize(sampled_logits_count);
|
|
845
|
+
for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
|
|
846
|
+
cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
|
|
847
|
+
}
|
|
848
|
+
} else {
|
|
849
|
+
const auto * logits = llama_get_logits_ith(ctx, idx);
|
|
850
|
+
GGML_ASSERT(logits != nullptr);
|
|
851
|
+
cur.resize(n_vocab);
|
|
852
|
+
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
|
853
|
+
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
|
|
854
|
+
}
|
|
527
855
|
}
|
|
528
856
|
|
|
529
857
|
llama_token_data_array cur_p = {
|
|
@@ -544,19 +872,35 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte
|
|
|
544
872
|
return token;
|
|
545
873
|
}
|
|
546
874
|
|
|
875
|
+
|
|
547
876
|
void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
|
|
548
877
|
auto * p = (llama_sampler_chain *) chain->ctx;
|
|
549
|
-
p->samplers.push_back(
|
|
878
|
+
p->samplers.push_back({
|
|
879
|
+
/* .is_backend = */ false,
|
|
880
|
+
/* .ptr = */ smpl,
|
|
881
|
+
});
|
|
550
882
|
}
|
|
551
883
|
|
|
552
|
-
struct llama_sampler * llama_sampler_chain_get(
|
|
884
|
+
struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i) {
|
|
885
|
+
if (chain == nullptr) {
|
|
886
|
+
return nullptr;
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
if (chain->iface != &llama_sampler_chain_i) {
|
|
890
|
+
return nullptr;
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
if (i == -1) {
|
|
894
|
+
return chain;
|
|
895
|
+
}
|
|
896
|
+
|
|
553
897
|
const auto * p = (const llama_sampler_chain *) chain->ctx;
|
|
554
898
|
|
|
555
899
|
if (i < 0 || (size_t) i >= p->samplers.size()) {
|
|
556
900
|
return nullptr;
|
|
557
901
|
}
|
|
558
902
|
|
|
559
|
-
return p->samplers[i];
|
|
903
|
+
return p->samplers[i].ptr;
|
|
560
904
|
}
|
|
561
905
|
|
|
562
906
|
struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
|
|
@@ -566,7 +910,7 @@ struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain,
|
|
|
566
910
|
return nullptr;
|
|
567
911
|
}
|
|
568
912
|
|
|
569
|
-
auto * result = p->samplers[i];
|
|
913
|
+
auto * result = p->samplers[i].ptr;
|
|
570
914
|
p->samplers.erase(p->samplers.begin() + i);
|
|
571
915
|
|
|
572
916
|
return result;
|
|
@@ -584,8 +928,36 @@ int llama_sampler_chain_n(const struct llama_sampler * chain) {
|
|
|
584
928
|
|
|
585
929
|
// greedy
|
|
586
930
|
|
|
587
|
-
|
|
588
|
-
|
|
931
|
+
struct llama_sampler_greedy : public llama_sampler_backend {
|
|
932
|
+
};
|
|
933
|
+
|
|
934
|
+
static const char * llama_sampler_greedy_name(const struct llama_sampler * smpl) {
|
|
935
|
+
auto * sctx = (llama_sampler_greedy *) smpl->ctx;
|
|
936
|
+
return sctx->get_name();
|
|
937
|
+
}
|
|
938
|
+
|
|
939
|
+
static void llama_sampler_greedy_reset(struct llama_sampler * smpl) {
|
|
940
|
+
auto * ctx = (llama_sampler_greedy *) smpl->ctx;
|
|
941
|
+
GGML_UNUSED(ctx);
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
static struct llama_sampler * llama_sampler_greedy_clone(const struct llama_sampler * smpl) {
|
|
945
|
+
const auto * ctx = (const llama_sampler_greedy *) smpl->ctx;
|
|
946
|
+
auto * result = llama_sampler_init_greedy();
|
|
947
|
+
|
|
948
|
+
// copy the state
|
|
949
|
+
{
|
|
950
|
+
auto * result_ctx = (llama_sampler_greedy *) result->ctx;
|
|
951
|
+
|
|
952
|
+
GGML_UNUSED(ctx);
|
|
953
|
+
GGML_UNUSED(result_ctx);
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
return result;
|
|
957
|
+
}
|
|
958
|
+
|
|
959
|
+
static void llama_sampler_greedy_free(struct llama_sampler * smpl) {
|
|
960
|
+
delete (llama_sampler_greedy *) smpl->ctx;
|
|
589
961
|
}
|
|
590
962
|
|
|
591
963
|
static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
|
|
@@ -597,33 +969,72 @@ static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_to
|
|
|
597
969
|
}
|
|
598
970
|
}
|
|
599
971
|
|
|
972
|
+
static bool llama_sampler_greedy_backend_init(
|
|
973
|
+
struct llama_sampler * smpl,
|
|
974
|
+
ggml_backend_buffer_type_t buft) {
|
|
975
|
+
auto * sctx = (llama_sampler_greedy *) smpl->ctx;
|
|
976
|
+
|
|
977
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
978
|
+
|
|
979
|
+
sctx->init(res);
|
|
980
|
+
|
|
981
|
+
return res;
|
|
982
|
+
}
|
|
983
|
+
|
|
984
|
+
static void llama_sampler_greedy_backend_apply(
|
|
985
|
+
struct llama_sampler * smpl,
|
|
986
|
+
struct ggml_context * ctx,
|
|
987
|
+
struct ggml_cgraph * gf,
|
|
988
|
+
struct llama_sampler_data * data) {
|
|
989
|
+
GGML_UNUSED(gf);
|
|
990
|
+
GGML_UNUSED(smpl);
|
|
991
|
+
|
|
992
|
+
struct ggml_tensor * curl = ggml_argmax(ctx, data->logits);
|
|
993
|
+
ggml_set_name(curl, "greedy_argmax");
|
|
994
|
+
|
|
995
|
+
data->sampled = curl;
|
|
996
|
+
}
|
|
997
|
+
|
|
600
998
|
static struct llama_sampler_i llama_sampler_greedy_i = {
|
|
601
|
-
/* .name
|
|
602
|
-
/* .accept
|
|
603
|
-
/* .apply
|
|
604
|
-
/* .reset
|
|
605
|
-
/* .clone
|
|
606
|
-
/* .free
|
|
999
|
+
/* .name = */ llama_sampler_greedy_name,
|
|
1000
|
+
/* .accept = */ nullptr,
|
|
1001
|
+
/* .apply = */ llama_sampler_greedy_apply,
|
|
1002
|
+
/* .reset = */ llama_sampler_greedy_reset,
|
|
1003
|
+
/* .clone = */ llama_sampler_greedy_clone,
|
|
1004
|
+
/* .free = */ llama_sampler_greedy_free,
|
|
1005
|
+
/* .backend_init = */ llama_sampler_greedy_backend_init,
|
|
1006
|
+
/* .backend_accept = */ nullptr,
|
|
1007
|
+
/* .backend_apply = */ llama_sampler_greedy_backend_apply,
|
|
1008
|
+
/* .backend_set_input = */ nullptr,
|
|
607
1009
|
};
|
|
608
1010
|
|
|
609
1011
|
struct llama_sampler * llama_sampler_init_greedy() {
|
|
610
1012
|
return llama_sampler_init(
|
|
611
1013
|
/* .iface = */ &llama_sampler_greedy_i,
|
|
612
|
-
/* .ctx = */
|
|
1014
|
+
/* .ctx = */ new llama_sampler_greedy {
|
|
1015
|
+
("greedy"),
|
|
1016
|
+
}
|
|
613
1017
|
);
|
|
614
1018
|
}
|
|
615
1019
|
|
|
616
1020
|
// dist
|
|
617
1021
|
|
|
618
|
-
struct llama_sampler_dist {
|
|
1022
|
+
struct llama_sampler_dist : public llama_sampler_backend {
|
|
619
1023
|
const uint32_t seed;
|
|
620
1024
|
uint32_t seed_cur;
|
|
621
1025
|
|
|
622
1026
|
std::mt19937 rng;
|
|
1027
|
+
|
|
1028
|
+
// backend input
|
|
1029
|
+
struct ggml_tensor * inp_uniform;
|
|
1030
|
+
|
|
1031
|
+
ggml_context_ptr inp_ctx;
|
|
1032
|
+
ggml_backend_buffer_ptr inp_buf;
|
|
623
1033
|
};
|
|
624
1034
|
|
|
625
|
-
static const char * llama_sampler_dist_name(const struct llama_sampler *
|
|
626
|
-
|
|
1035
|
+
static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {
|
|
1036
|
+
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
|
1037
|
+
return sctx->get_name();
|
|
627
1038
|
}
|
|
628
1039
|
|
|
629
1040
|
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -698,6 +1109,12 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
|
|
|
698
1109
|
#endif
|
|
699
1110
|
}
|
|
700
1111
|
|
|
1112
|
+
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
|
|
1113
|
+
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
|
1114
|
+
ctx->seed_cur = get_rng_seed(ctx->seed);
|
|
1115
|
+
ctx->rng.seed(ctx->seed_cur);
|
|
1116
|
+
}
|
|
1117
|
+
|
|
701
1118
|
static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
|
|
702
1119
|
const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
|
|
703
1120
|
auto * result = llama_sampler_init_dist(ctx->seed);
|
|
@@ -712,23 +1129,127 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
|
|
|
712
1129
|
return result;
|
|
713
1130
|
}
|
|
714
1131
|
|
|
715
|
-
static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
|
|
716
|
-
auto * ctx = (llama_sampler_dist *) smpl->ctx;
|
|
717
|
-
ctx->seed_cur = get_rng_seed(ctx->seed);
|
|
718
|
-
ctx->rng.seed(ctx->seed_cur);
|
|
719
|
-
}
|
|
720
|
-
|
|
721
1132
|
static void llama_sampler_dist_free(struct llama_sampler * smpl) {
|
|
722
1133
|
delete (llama_sampler_dist *) smpl->ctx;
|
|
723
1134
|
}
|
|
724
1135
|
|
|
1136
|
+
static bool llama_sampler_dist_backend_init(
|
|
1137
|
+
struct llama_sampler * smpl,
|
|
1138
|
+
ggml_backend_buffer_type_t buft) {
|
|
1139
|
+
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
|
1140
|
+
|
|
1141
|
+
// allocate inputs
|
|
1142
|
+
{
|
|
1143
|
+
ggml_init_params params = {
|
|
1144
|
+
/*.mem_size =*/ ggml_tensor_overhead(),
|
|
1145
|
+
/*.mem_buffer =*/ nullptr,
|
|
1146
|
+
/*.no_alloc =*/ true,
|
|
1147
|
+
};
|
|
1148
|
+
|
|
1149
|
+
sctx->inp_ctx.reset(ggml_init(params));
|
|
1150
|
+
|
|
1151
|
+
// Create the uniform random scalar input tensor. This will be set by
|
|
1152
|
+
// llama_sampler_dist_backend_set_input after this graph is built.
|
|
1153
|
+
sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1);
|
|
1154
|
+
ggml_set_name (sctx->inp_uniform, "uniform");
|
|
1155
|
+
ggml_set_input(sctx->inp_uniform);
|
|
1156
|
+
|
|
1157
|
+
// Allocate all tensors from our context to the backend
|
|
1158
|
+
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
|
|
1159
|
+
|
|
1160
|
+
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
|
|
1161
|
+
}
|
|
1162
|
+
|
|
1163
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
1164
|
+
|
|
1165
|
+
sctx->init(res);
|
|
1166
|
+
|
|
1167
|
+
if (!res) {
|
|
1168
|
+
sctx->inp_ctx.reset(nullptr);
|
|
1169
|
+
sctx->inp_buf.reset(nullptr);
|
|
1170
|
+
}
|
|
1171
|
+
|
|
1172
|
+
return res;
|
|
1173
|
+
}
|
|
1174
|
+
|
|
1175
|
+
static void llama_sampler_dist_backend_apply(
|
|
1176
|
+
struct llama_sampler * smpl,
|
|
1177
|
+
struct ggml_context * ctx,
|
|
1178
|
+
struct ggml_cgraph * gf,
|
|
1179
|
+
struct llama_sampler_data * data) {
|
|
1180
|
+
GGML_UNUSED(gf);
|
|
1181
|
+
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
|
1182
|
+
|
|
1183
|
+
struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
|
|
1184
|
+
ggml_set_name(probs, "dist_probs");
|
|
1185
|
+
|
|
1186
|
+
struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs);
|
|
1187
|
+
ggml_set_name(cumsum, "dist_cumsum");
|
|
1188
|
+
|
|
1189
|
+
// The uniform tensor has a random value and we subtract this tensor from
|
|
1190
|
+
// the cumsum tensor (the uniform tensor will be broadcasted by ggml_sub).
|
|
1191
|
+
// Recall that each entry in cumsum is the cumulative probability up to that
|
|
1192
|
+
// index so values stay negative while the cumulative total is below the
|
|
1193
|
+
// random value, and become zero/positive once the threshold is crossed.
|
|
1194
|
+
struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform);
|
|
1195
|
+
ggml_set_name(diff, "dist_cumsum");
|
|
1196
|
+
|
|
1197
|
+
// The ggml_step function produces a tensor where entries are 1 if the
|
|
1198
|
+
// corresponding entry in diff is > 0, and 0 otherwise. So all values up to
|
|
1199
|
+
// the index where the cumulative probability exceeds the random value are 0,
|
|
1200
|
+
// and all entries after that are 1.
|
|
1201
|
+
struct ggml_tensor * mask = ggml_step(ctx, diff);
|
|
1202
|
+
ggml_set_name(mask, "dist_mask");
|
|
1203
|
+
|
|
1204
|
+
// Taking the sum of the mask gives us the number of elements after the threshold
|
|
1205
|
+
// we are interested in.
|
|
1206
|
+
struct ggml_tensor * idxf = ggml_sum(ctx, mask);
|
|
1207
|
+
ggml_set_name(idxf, "dist_index_f32");
|
|
1208
|
+
|
|
1209
|
+
// Use ggml_scale_bias to scale the index value by -1 and then add the size
|
|
1210
|
+
// of the mask to that value so we get the correct index ((-1 * idxf) + n).
|
|
1211
|
+
struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
|
|
1212
|
+
ggml_set_name(idx, "dist_index_i32");
|
|
1213
|
+
|
|
1214
|
+
// Map back to original vocab ids if a candidates tensor is available.
|
|
1215
|
+
struct ggml_tensor * sampled_token = idx;
|
|
1216
|
+
if (data->candidates != nullptr) {
|
|
1217
|
+
struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates));
|
|
1218
|
+
|
|
1219
|
+
sampled_token = ggml_get_rows(ctx, candidates, idx);
|
|
1220
|
+
ggml_set_name(sampled_token, "dist_sampled_token");
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
data->sampled = sampled_token;
|
|
1224
|
+
data->probs = probs;
|
|
1225
|
+
}
|
|
1226
|
+
|
|
1227
|
+
static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
|
|
1228
|
+
auto * sctx = (llama_sampler_dist *) smpl->ctx;
|
|
1229
|
+
GGML_ASSERT(sctx->inp_uniform != nullptr);
|
|
1230
|
+
|
|
1231
|
+
// We sample in double precision and cast to float to match rnd numbers of
|
|
1232
|
+
// llama_sampler_dist which uses double precision (sampling from
|
|
1233
|
+
// std::uniform_real_distribution<double> and
|
|
1234
|
+
// std::uniform_real_distribution<float> with same rng will produce
|
|
1235
|
+
// different sequences).
|
|
1236
|
+
std::uniform_real_distribution<double> dist(0.0f, 1.0f);
|
|
1237
|
+
const float rnd = dist(sctx->rng);
|
|
1238
|
+
|
|
1239
|
+
ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
|
|
1240
|
+
}
|
|
1241
|
+
|
|
725
1242
|
static struct llama_sampler_i llama_sampler_dist_i = {
|
|
726
|
-
/* .name
|
|
727
|
-
/* .accept
|
|
728
|
-
/* .apply
|
|
729
|
-
/* .reset
|
|
730
|
-
/* .clone
|
|
731
|
-
/* .free
|
|
1243
|
+
/* .name = */ llama_sampler_dist_name,
|
|
1244
|
+
/* .accept = */ nullptr,
|
|
1245
|
+
/* .apply = */ llama_sampler_dist_apply,
|
|
1246
|
+
/* .reset = */ llama_sampler_dist_reset,
|
|
1247
|
+
/* .clone = */ llama_sampler_dist_clone,
|
|
1248
|
+
/* .free = */ llama_sampler_dist_free,
|
|
1249
|
+
/* .backend_init = */ llama_sampler_dist_backend_init,
|
|
1250
|
+
/* .backend_accept = */ nullptr,
|
|
1251
|
+
/* .backend_apply = */ llama_sampler_dist_backend_apply,
|
|
1252
|
+
/* .backend_set_input = */ llama_sampler_dist_backend_set_input,
|
|
732
1253
|
};
|
|
733
1254
|
|
|
734
1255
|
struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
|
@@ -736,21 +1257,26 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
|
|
|
736
1257
|
return llama_sampler_init(
|
|
737
1258
|
/* .iface = */ &llama_sampler_dist_i,
|
|
738
1259
|
/* .ctx = */ new llama_sampler_dist {
|
|
739
|
-
|
|
740
|
-
/* .
|
|
741
|
-
/* .
|
|
1260
|
+
("dist"),
|
|
1261
|
+
/* .seed = */ seed,
|
|
1262
|
+
/* .seed_cur = */ seed_cur,
|
|
1263
|
+
/* .rng = */ std::mt19937(seed_cur),
|
|
1264
|
+
/* .inp_uniform = */ nullptr,
|
|
1265
|
+
/* .inp_ctx = */ nullptr,
|
|
1266
|
+
/* .inp_buf = */ nullptr,
|
|
742
1267
|
}
|
|
743
1268
|
);
|
|
744
1269
|
}
|
|
745
1270
|
|
|
746
1271
|
// top-k
|
|
747
1272
|
|
|
748
|
-
struct llama_sampler_top_k {
|
|
1273
|
+
struct llama_sampler_top_k : public llama_sampler_backend {
|
|
749
1274
|
const int32_t k;
|
|
750
1275
|
};
|
|
751
1276
|
|
|
752
|
-
static const char * llama_sampler_top_k_name(const struct llama_sampler *
|
|
753
|
-
|
|
1277
|
+
static const char * llama_sampler_top_k_name(const struct llama_sampler * smpl) {
|
|
1278
|
+
auto * sctx = (llama_sampler_top_k *) smpl->ctx;
|
|
1279
|
+
return sctx->get_name();
|
|
754
1280
|
}
|
|
755
1281
|
|
|
756
1282
|
static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -767,19 +1293,69 @@ static void llama_sampler_top_k_free(struct llama_sampler * smpl) {
|
|
|
767
1293
|
delete (llama_sampler_top_k *) smpl->ctx;
|
|
768
1294
|
}
|
|
769
1295
|
|
|
1296
|
+
static bool llama_sampler_top_k_backend_init(
|
|
1297
|
+
struct llama_sampler * smpl,
|
|
1298
|
+
ggml_backend_buffer_type_t buft) {
|
|
1299
|
+
auto * sctx = (llama_sampler_top_k *) smpl->ctx;
|
|
1300
|
+
|
|
1301
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
1302
|
+
|
|
1303
|
+
sctx->init(res);
|
|
1304
|
+
|
|
1305
|
+
return res;
|
|
1306
|
+
}
|
|
1307
|
+
|
|
1308
|
+
static void llama_sampler_top_k_backend_apply(
|
|
1309
|
+
struct llama_sampler * smpl,
|
|
1310
|
+
struct ggml_context * ctx,
|
|
1311
|
+
struct ggml_cgraph * gf,
|
|
1312
|
+
struct llama_sampler_data * data) {
|
|
1313
|
+
auto * sctx = (llama_sampler_top_k *) smpl->ctx;
|
|
1314
|
+
|
|
1315
|
+
struct ggml_tensor * top_k = ggml_top_k(ctx, data->logits, sctx->k);
|
|
1316
|
+
ggml_set_name(top_k, "top_k");
|
|
1317
|
+
|
|
1318
|
+
if (data->candidates) {
|
|
1319
|
+
struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
|
|
1320
|
+
data->candidates = ggml_get_rows(ctx, candidates_rows, top_k);
|
|
1321
|
+
data->candidates = ggml_reshape_1d(ctx, data->candidates, sctx->k);
|
|
1322
|
+
ggml_set_name(data->candidates, "top_k_candidates");
|
|
1323
|
+
} else {
|
|
1324
|
+
data->candidates = top_k;
|
|
1325
|
+
}
|
|
1326
|
+
|
|
1327
|
+
struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
|
|
1328
|
+
struct ggml_tensor * top_k_rows = ggml_get_rows(ctx, logits_rows, top_k);
|
|
1329
|
+
data->logits = ggml_reshape_1d(ctx, top_k_rows, sctx->k);
|
|
1330
|
+
ggml_set_name(top_k_rows, "top_k_rows");
|
|
1331
|
+
|
|
1332
|
+
GGML_UNUSED(gf);
|
|
1333
|
+
}
|
|
1334
|
+
|
|
770
1335
|
static struct llama_sampler_i llama_sampler_top_k_i = {
|
|
771
|
-
/* .name
|
|
772
|
-
/* .accept
|
|
773
|
-
/* .apply
|
|
774
|
-
/* .reset
|
|
775
|
-
/* .clone
|
|
776
|
-
/* .free
|
|
1336
|
+
/* .name = */ llama_sampler_top_k_name,
|
|
1337
|
+
/* .accept = */ nullptr,
|
|
1338
|
+
/* .apply = */ llama_sampler_top_k_apply,
|
|
1339
|
+
/* .reset = */ nullptr,
|
|
1340
|
+
/* .clone = */ llama_sampler_top_k_clone,
|
|
1341
|
+
/* .free = */ llama_sampler_top_k_free,
|
|
1342
|
+
/* .backend_init = */ llama_sampler_top_k_backend_init,
|
|
1343
|
+
/* .backend_accept = */ nullptr,
|
|
1344
|
+
/* .backend_apply = */ llama_sampler_top_k_backend_apply,
|
|
1345
|
+
/* .backend_set_input = */ nullptr,
|
|
777
1346
|
};
|
|
778
1347
|
|
|
779
1348
|
struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
|
1349
|
+
const bool is_empty = (k <= 0);
|
|
1350
|
+
|
|
1351
|
+
if (is_empty) {
|
|
1352
|
+
return llama_sampler_init_empty("?top-k");
|
|
1353
|
+
}
|
|
1354
|
+
|
|
780
1355
|
return llama_sampler_init(
|
|
781
1356
|
/* .iface = */ &llama_sampler_top_k_i,
|
|
782
1357
|
/* .ctx = */ new llama_sampler_top_k {
|
|
1358
|
+
("top-k"),
|
|
783
1359
|
/* .k = */ k,
|
|
784
1360
|
}
|
|
785
1361
|
);
|
|
@@ -787,15 +1363,16 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
|
|
|
787
1363
|
|
|
788
1364
|
// top-p
|
|
789
1365
|
|
|
790
|
-
struct llama_sampler_top_p {
|
|
1366
|
+
struct llama_sampler_top_p : public llama_sampler_backend {
|
|
791
1367
|
const float p;
|
|
792
1368
|
const size_t min_keep;
|
|
793
1369
|
|
|
794
1370
|
std::vector<llama_token_data> buf_sort;
|
|
795
1371
|
};
|
|
796
1372
|
|
|
797
|
-
static const char * llama_sampler_top_p_name(const struct llama_sampler *
|
|
798
|
-
|
|
1373
|
+
static const char * llama_sampler_top_p_name(const struct llama_sampler * smpl) {
|
|
1374
|
+
auto * sctx = (llama_sampler_top_p *) smpl->ctx;
|
|
1375
|
+
return sctx->get_name();
|
|
799
1376
|
}
|
|
800
1377
|
|
|
801
1378
|
static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -862,19 +1439,118 @@ static void llama_sampler_top_p_free(struct llama_sampler * smpl) {
|
|
|
862
1439
|
delete (llama_sampler_top_p *) smpl->ctx;
|
|
863
1440
|
}
|
|
864
1441
|
|
|
1442
|
+
static bool llama_sampler_top_p_backend_init(
|
|
1443
|
+
struct llama_sampler * smpl,
|
|
1444
|
+
ggml_backend_buffer_type_t buft) {
|
|
1445
|
+
auto * sctx = (llama_sampler_top_p *) smpl->ctx;
|
|
1446
|
+
|
|
1447
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
1448
|
+
|
|
1449
|
+
sctx->init(res);
|
|
1450
|
+
|
|
1451
|
+
return res;
|
|
1452
|
+
}
|
|
1453
|
+
|
|
1454
|
+
static void llama_sampler_top_p_backend_apply(
|
|
1455
|
+
struct llama_sampler * smpl,
|
|
1456
|
+
struct ggml_context * ctx,
|
|
1457
|
+
struct ggml_cgraph * gf,
|
|
1458
|
+
struct llama_sampler_data * data) {
|
|
1459
|
+
auto * sctx = (llama_sampler_top_p *) smpl->ctx;
|
|
1460
|
+
|
|
1461
|
+
auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
|
|
1462
|
+
GGML_ASSERT(ggml_nrows(a) == 1);
|
|
1463
|
+
struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]);
|
|
1464
|
+
struct ggml_tensor * a_sorted = ggml_get_rows(ctx, a_reshaped, b);
|
|
1465
|
+
return ggml_reshape_1d(ctx, a_sorted, a->ne[0]);
|
|
1466
|
+
};
|
|
1467
|
+
|
|
1468
|
+
// Get the sorted logits in descending order.
|
|
1469
|
+
struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC);
|
|
1470
|
+
ggml_set_name(sorted_idx, "top_p_sorted_idx");
|
|
1471
|
+
|
|
1472
|
+
// Do the sorting via reshape + get_rows
|
|
1473
|
+
struct ggml_tensor * sorted_logits = ggml_sort(data->logits, sorted_idx);
|
|
1474
|
+
ggml_set_name(sorted_logits, "top_p_sorted_logits");
|
|
1475
|
+
|
|
1476
|
+
struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits);
|
|
1477
|
+
ggml_set_name(softmax, "top_p_softmax");
|
|
1478
|
+
|
|
1479
|
+
// If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates.
|
|
1480
|
+
if (data->candidates) {
|
|
1481
|
+
data->candidates = ggml_sort(data->candidates, sorted_idx);
|
|
1482
|
+
} else {
|
|
1483
|
+
data->candidates = sorted_idx;
|
|
1484
|
+
}
|
|
1485
|
+
ggml_set_name(data->candidates, "top_p_candidates");
|
|
1486
|
+
|
|
1487
|
+
// Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM.
|
|
1488
|
+
struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax);
|
|
1489
|
+
ggml_set_name(cdf, "top_p_cdf");
|
|
1490
|
+
|
|
1491
|
+
// Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep
|
|
1492
|
+
struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p);
|
|
1493
|
+
ggml_set_name(cdf_scaled, "top_p_cdf_scaled");
|
|
1494
|
+
|
|
1495
|
+
struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled);
|
|
1496
|
+
ggml_set_name(mask, "top_p_mask");
|
|
1497
|
+
|
|
1498
|
+
// Taking the sum of the mask gives us the number of elements before the threshold
|
|
1499
|
+
// we are interested in.
|
|
1500
|
+
struct ggml_tensor * idxf = ggml_sum(ctx, mask);
|
|
1501
|
+
ggml_set_name(idxf, "top_p_index_f32");
|
|
1502
|
+
|
|
1503
|
+
// prevent out-of-bounds access
|
|
1504
|
+
idxf = ggml_clamp(ctx, idxf, 0.0f, mask->ne[0] - 1);
|
|
1505
|
+
|
|
1506
|
+
// construct ones tensor to set the value in the mask
|
|
1507
|
+
struct ggml_tensor * ones = ggml_scale_bias(ctx, idxf, 0.0f, 1.0f);
|
|
1508
|
+
ggml_set_name(ones, "top_p_ones");
|
|
1509
|
+
|
|
1510
|
+
// Make top-p inclusive (i.e. also keep the first value at which the cum_sum/cdf reaches p)
|
|
1511
|
+
struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]);
|
|
1512
|
+
|
|
1513
|
+
mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
|
|
1514
|
+
mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
|
|
1515
|
+
|
|
1516
|
+
// Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
|
|
1517
|
+
// top_p_bias = (mask * 1e9f) - 1e9f.
|
|
1518
|
+
// So entries in the mask that we want to discard will become -1e9f, and
|
|
1519
|
+
// others will be 0 (meaning they will not affect the logits).
|
|
1520
|
+
const float large_val = 1e9f;
|
|
1521
|
+
struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1522
|
+
ggml_set_name(top_p_bias, "top_p_bias");
|
|
1523
|
+
|
|
1524
|
+
data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
|
|
1525
|
+
ggml_set_name(data->logits, "top_p_logits");
|
|
1526
|
+
|
|
1527
|
+
GGML_UNUSED(gf);
|
|
1528
|
+
}
|
|
1529
|
+
|
|
865
1530
|
static struct llama_sampler_i llama_sampler_top_p_i = {
|
|
866
|
-
/* .name
|
|
867
|
-
/* .accept
|
|
868
|
-
/* .apply
|
|
869
|
-
/* .reset
|
|
870
|
-
/* .clone
|
|
871
|
-
/* .free
|
|
1531
|
+
/* .name = */ llama_sampler_top_p_name,
|
|
1532
|
+
/* .accept = */ nullptr,
|
|
1533
|
+
/* .apply = */ llama_sampler_top_p_apply,
|
|
1534
|
+
/* .reset = */ nullptr,
|
|
1535
|
+
/* .clone = */ llama_sampler_top_p_clone,
|
|
1536
|
+
/* .free = */ llama_sampler_top_p_free,
|
|
1537
|
+
/* .backend_init = */ llama_sampler_top_p_backend_init,
|
|
1538
|
+
/* .backend_accept = */ nullptr,
|
|
1539
|
+
/* .backend_apply = */ llama_sampler_top_p_backend_apply,
|
|
1540
|
+
/* .backend_set_input = */ nullptr,
|
|
872
1541
|
};
|
|
873
1542
|
|
|
874
1543
|
struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
|
1544
|
+
const bool is_empty = p >= 1.0f;
|
|
1545
|
+
|
|
1546
|
+
if (is_empty) {
|
|
1547
|
+
return llama_sampler_init_empty("?top-p");
|
|
1548
|
+
}
|
|
1549
|
+
|
|
875
1550
|
return llama_sampler_init(
|
|
876
1551
|
/* .iface = */ &llama_sampler_top_p_i,
|
|
877
1552
|
/* .ctx = */ new llama_sampler_top_p {
|
|
1553
|
+
("top-p"),
|
|
878
1554
|
/* .p = */ p,
|
|
879
1555
|
/* .min_keep = */ min_keep,
|
|
880
1556
|
/* .buf_sort = */ {},
|
|
@@ -884,13 +1560,14 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
|
|
|
884
1560
|
|
|
885
1561
|
// min-p
|
|
886
1562
|
|
|
887
|
-
struct llama_sampler_min_p {
|
|
1563
|
+
struct llama_sampler_min_p : public llama_sampler_backend {
|
|
888
1564
|
const float p;
|
|
889
1565
|
const size_t min_keep;
|
|
890
1566
|
};
|
|
891
1567
|
|
|
892
|
-
static const char * llama_sampler_min_p_name(const struct llama_sampler *
|
|
893
|
-
|
|
1568
|
+
static const char * llama_sampler_min_p_name(const struct llama_sampler * smpl) {
|
|
1569
|
+
auto * sctx = (llama_sampler_min_p *) smpl->ctx;
|
|
1570
|
+
return sctx->get_name();
|
|
894
1571
|
}
|
|
895
1572
|
|
|
896
1573
|
static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -956,19 +1633,85 @@ static void llama_sampler_min_p_free(struct llama_sampler * smpl) {
|
|
|
956
1633
|
delete (llama_sampler_min_p *) smpl->ctx;
|
|
957
1634
|
}
|
|
958
1635
|
|
|
1636
|
+
static bool llama_sampler_min_p_backend_init(
|
|
1637
|
+
struct llama_sampler * smpl,
|
|
1638
|
+
ggml_backend_buffer_type_t buft) {
|
|
1639
|
+
auto * sctx = (llama_sampler_min_p *) smpl->ctx;
|
|
1640
|
+
|
|
1641
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
1642
|
+
|
|
1643
|
+
sctx->init(res);
|
|
1644
|
+
|
|
1645
|
+
return res;
|
|
1646
|
+
}
|
|
1647
|
+
|
|
1648
|
+
static void llama_sampler_min_p_backend_apply(
|
|
1649
|
+
struct llama_sampler * smpl,
|
|
1650
|
+
struct ggml_context * ctx,
|
|
1651
|
+
struct ggml_cgraph * gf,
|
|
1652
|
+
struct llama_sampler_data * data) {
|
|
1653
|
+
auto * sctx = (llama_sampler_min_p *) smpl->ctx;
|
|
1654
|
+
|
|
1655
|
+
struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
|
|
1656
|
+
ggml_set_name(max_idx, "max_idx");
|
|
1657
|
+
|
|
1658
|
+
struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
|
|
1659
|
+
ggml_set_name(logits_rows, "logits_rows");
|
|
1660
|
+
|
|
1661
|
+
struct ggml_tensor * max_logit = ggml_get_rows(ctx, logits_rows, max_idx);
|
|
1662
|
+
ggml_set_name(max_logit, "max_logit");
|
|
1663
|
+
|
|
1664
|
+
// Calculate the threshold value.
|
|
1665
|
+
struct ggml_tensor * threshold = ggml_scale_bias(ctx, max_logit, 1.0f, logf(sctx->p));
|
|
1666
|
+
ggml_set_name(threshold, "min_p_threshold");
|
|
1667
|
+
|
|
1668
|
+
// Subtract the threshold from logits.
|
|
1669
|
+
struct ggml_tensor * sub = ggml_sub(ctx, data->logits, threshold);
|
|
1670
|
+
|
|
1671
|
+
// Create a mask where logits below the threshold are 0 (discard),
|
|
1672
|
+
// and others are 1 (keep).
|
|
1673
|
+
struct ggml_tensor * mask = ggml_step(ctx, sub);
|
|
1674
|
+
ggml_set_name(mask, "min_p_mask");
|
|
1675
|
+
|
|
1676
|
+
// Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
|
|
1677
|
+
// min_p_bias = (mask * 1e9f) - 1e9f.
|
|
1678
|
+
// So entries in the mask that we want to discard will become -1e9f, and
|
|
1679
|
+
// others will be 0 (meaning they will not affect the logits).
|
|
1680
|
+
const float large_val = 1e9f;
|
|
1681
|
+
struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
|
|
1682
|
+
ggml_set_name(min_p_bias, "min_p_bias");
|
|
1683
|
+
|
|
1684
|
+
// Add the min_p bias to the logits.
|
|
1685
|
+
data->logits = ggml_add(ctx, data->logits, min_p_bias);
|
|
1686
|
+
ggml_set_name(data->logits, "min_p_logits");
|
|
1687
|
+
|
|
1688
|
+
GGML_UNUSED(gf);
|
|
1689
|
+
}
|
|
1690
|
+
|
|
959
1691
|
static struct llama_sampler_i llama_sampler_min_p_i = {
|
|
960
|
-
/* .name
|
|
961
|
-
/* .accept
|
|
962
|
-
/* .apply
|
|
963
|
-
/* .reset
|
|
964
|
-
/* .clone
|
|
965
|
-
/* .free
|
|
1692
|
+
/* .name = */ llama_sampler_min_p_name,
|
|
1693
|
+
/* .accept = */ nullptr,
|
|
1694
|
+
/* .apply = */ llama_sampler_min_p_apply,
|
|
1695
|
+
/* .reset = */ nullptr,
|
|
1696
|
+
/* .clone = */ llama_sampler_min_p_clone,
|
|
1697
|
+
/* .free = */ llama_sampler_min_p_free,
|
|
1698
|
+
/* .backend_init = */ llama_sampler_min_p_backend_init,
|
|
1699
|
+
/* .backend_accept = */ nullptr,
|
|
1700
|
+
/* .backend_apply = */ llama_sampler_min_p_backend_apply,
|
|
1701
|
+
/* .backend_set_input = */ nullptr,
|
|
966
1702
|
};
|
|
967
1703
|
|
|
968
1704
|
struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
|
|
1705
|
+
const bool is_empty = (p <= 0.0f);
|
|
1706
|
+
|
|
1707
|
+
if (is_empty) {
|
|
1708
|
+
return llama_sampler_init_empty("?min-p");
|
|
1709
|
+
}
|
|
1710
|
+
|
|
969
1711
|
return llama_sampler_init(
|
|
970
1712
|
/* .iface = */ &llama_sampler_min_p_i,
|
|
971
1713
|
/* .ctx = */ new llama_sampler_min_p {
|
|
1714
|
+
("min-p"),
|
|
972
1715
|
/* .p = */ p,
|
|
973
1716
|
/* .min_keep = */ min_keep,
|
|
974
1717
|
}
|
|
@@ -1056,15 +1799,25 @@ static void llama_sampler_typical_free(struct llama_sampler * smpl) {
|
|
|
1056
1799
|
}
|
|
1057
1800
|
|
|
1058
1801
|
static struct llama_sampler_i llama_sampler_typical_i = {
|
|
1059
|
-
/* .name
|
|
1060
|
-
/* .accept
|
|
1061
|
-
/* .apply
|
|
1062
|
-
/* .reset
|
|
1063
|
-
/* .clone
|
|
1064
|
-
/* .free
|
|
1802
|
+
/* .name = */ llama_sampler_typical_name,
|
|
1803
|
+
/* .accept = */ nullptr,
|
|
1804
|
+
/* .apply = */ llama_sampler_typical_apply,
|
|
1805
|
+
/* .reset = */ nullptr,
|
|
1806
|
+
/* .clone = */ llama_sampler_typical_clone,
|
|
1807
|
+
/* .free = */ llama_sampler_typical_free,
|
|
1808
|
+
/* .backend_init = */ nullptr,
|
|
1809
|
+
/* .backend_accept = */ nullptr,
|
|
1810
|
+
/* .backend_apply = */ nullptr,
|
|
1811
|
+
/* .backend_set_input = */ nullptr,
|
|
1065
1812
|
};
|
|
1066
1813
|
|
|
1067
1814
|
struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
|
|
1815
|
+
const bool is_empty = (p >= 1.0f);
|
|
1816
|
+
|
|
1817
|
+
if (is_empty) {
|
|
1818
|
+
return llama_sampler_init_empty("?typical");
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1068
1821
|
return llama_sampler_init(
|
|
1069
1822
|
/* .iface = */ &llama_sampler_typical_i,
|
|
1070
1823
|
/* .ctx = */ new llama_sampler_typical {
|
|
@@ -1076,12 +1829,13 @@ struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
|
|
|
1076
1829
|
|
|
1077
1830
|
// temp
|
|
1078
1831
|
|
|
1079
|
-
struct llama_sampler_temp {
|
|
1832
|
+
struct llama_sampler_temp : public llama_sampler_backend {
|
|
1080
1833
|
const float temp;
|
|
1081
1834
|
};
|
|
1082
1835
|
|
|
1083
|
-
static const char * llama_sampler_temp_name(const struct llama_sampler *
|
|
1084
|
-
|
|
1836
|
+
static const char * llama_sampler_temp_name(const struct llama_sampler * smpl) {
|
|
1837
|
+
auto * sctx = (llama_sampler_temp *) smpl->ctx;
|
|
1838
|
+
return sctx->get_name();
|
|
1085
1839
|
}
|
|
1086
1840
|
|
|
1087
1841
|
static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -1099,19 +1853,79 @@ static void llama_sampler_temp_free(struct llama_sampler * smpl) {
|
|
|
1099
1853
|
delete (llama_sampler_temp *) smpl->ctx;
|
|
1100
1854
|
}
|
|
1101
1855
|
|
|
1856
|
+
static void llama_sampler_backend_temp_sampling(
|
|
1857
|
+
struct ggml_context * ctx,
|
|
1858
|
+
struct ggml_cgraph * gf,
|
|
1859
|
+
struct llama_sampler_data * data,
|
|
1860
|
+
float temp) {
|
|
1861
|
+
if (temp <= 0.0f) {
|
|
1862
|
+
// Find the most probable token index.
|
|
1863
|
+
struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
|
|
1864
|
+
ggml_set_name(max_idx, "temp_max_idx");
|
|
1865
|
+
|
|
1866
|
+
if (data->candidates) {
|
|
1867
|
+
struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
|
|
1868
|
+
data->candidates = ggml_get_rows(ctx, candidates_rows, max_idx);
|
|
1869
|
+
} else {
|
|
1870
|
+
data->candidates = max_idx;
|
|
1871
|
+
}
|
|
1872
|
+
|
|
1873
|
+
struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
|
|
1874
|
+
data->logits = ggml_get_rows(ctx, logits_rows, max_idx);
|
|
1875
|
+
|
|
1876
|
+
return;
|
|
1877
|
+
}
|
|
1878
|
+
|
|
1879
|
+
data->logits = ggml_scale(ctx, data->logits, 1.0f / temp);
|
|
1880
|
+
|
|
1881
|
+
GGML_UNUSED(gf);
|
|
1882
|
+
}
|
|
1883
|
+
|
|
1884
|
+
static bool llama_sampler_temp_backend_init(
|
|
1885
|
+
struct llama_sampler * smpl,
|
|
1886
|
+
ggml_backend_buffer_type_t buft) {
|
|
1887
|
+
auto * sctx = (llama_sampler_temp *) smpl->ctx;
|
|
1888
|
+
|
|
1889
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
1890
|
+
|
|
1891
|
+
sctx->init(res);
|
|
1892
|
+
|
|
1893
|
+
return res;
|
|
1894
|
+
}
|
|
1895
|
+
|
|
1896
|
+
static void llama_sampler_temp_backend_apply(
|
|
1897
|
+
struct llama_sampler * smpl,
|
|
1898
|
+
struct ggml_context * ctx,
|
|
1899
|
+
struct ggml_cgraph * gf,
|
|
1900
|
+
struct llama_sampler_data * data) {
|
|
1901
|
+
auto * sctx = (llama_sampler_temp *) smpl->ctx;
|
|
1902
|
+
llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
|
|
1903
|
+
}
|
|
1904
|
+
|
|
1102
1905
|
static struct llama_sampler_i llama_sampler_temp_i = {
|
|
1103
|
-
/* .name
|
|
1104
|
-
/* .accept
|
|
1105
|
-
/* .apply
|
|
1106
|
-
/* .reset
|
|
1107
|
-
/* .clone
|
|
1108
|
-
/* .free
|
|
1906
|
+
/* .name = */ llama_sampler_temp_name,
|
|
1907
|
+
/* .accept = */ nullptr,
|
|
1908
|
+
/* .apply = */ llama_sampler_temp_apply,
|
|
1909
|
+
/* .reset = */ nullptr,
|
|
1910
|
+
/* .clone = */ llama_sampler_temp_clone,
|
|
1911
|
+
/* .free = */ llama_sampler_temp_free,
|
|
1912
|
+
/* .backend_init = */ llama_sampler_temp_backend_init,
|
|
1913
|
+
/* .backend_accept = */ nullptr,
|
|
1914
|
+
/* .backend_apply = */ llama_sampler_temp_backend_apply,
|
|
1915
|
+
/* .backend_set_input = */ nullptr,
|
|
1109
1916
|
};
|
|
1110
1917
|
|
|
1111
1918
|
struct llama_sampler * llama_sampler_init_temp(float temp) {
|
|
1919
|
+
const bool is_empty = temp == 1.0f;
|
|
1920
|
+
|
|
1921
|
+
if (is_empty) {
|
|
1922
|
+
return llama_sampler_init_empty("?temp");
|
|
1923
|
+
}
|
|
1924
|
+
|
|
1112
1925
|
return llama_sampler_init(
|
|
1113
1926
|
/* .iface = */ &llama_sampler_temp_i,
|
|
1114
1927
|
/* .ctx = */ new llama_sampler_temp {
|
|
1928
|
+
("temp"),
|
|
1115
1929
|
/*.temp = */ temp,
|
|
1116
1930
|
}
|
|
1117
1931
|
);
|
|
@@ -1119,14 +1933,15 @@ struct llama_sampler * llama_sampler_init_temp(float temp) {
|
|
|
1119
1933
|
|
|
1120
1934
|
// temp-ext
|
|
1121
1935
|
|
|
1122
|
-
struct llama_sampler_temp_ext {
|
|
1936
|
+
struct llama_sampler_temp_ext : public llama_sampler_backend {
|
|
1123
1937
|
const float temp;
|
|
1124
1938
|
const float delta;
|
|
1125
1939
|
const float exponent;
|
|
1126
1940
|
};
|
|
1127
1941
|
|
|
1128
|
-
static const char * llama_sampler_temp_ext_name(const struct llama_sampler *
|
|
1129
|
-
|
|
1942
|
+
static const char * llama_sampler_temp_ext_name(const struct llama_sampler * smpl) {
|
|
1943
|
+
auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
|
|
1944
|
+
return sctx->get_name();
|
|
1130
1945
|
}
|
|
1131
1946
|
|
|
1132
1947
|
static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -1209,24 +2024,112 @@ static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
|
|
|
1209
2024
|
delete (llama_sampler_temp_ext *) smpl->ctx;
|
|
1210
2025
|
}
|
|
1211
2026
|
|
|
2027
|
+
static bool llama_sampler_temp_ext_backend_init(
|
|
2028
|
+
struct llama_sampler * smpl,
|
|
2029
|
+
ggml_backend_buffer_type_t buft) {
|
|
2030
|
+
auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
|
|
2031
|
+
|
|
2032
|
+
const bool res = llama_sampler_backend_support(smpl, buft);
|
|
2033
|
+
|
|
2034
|
+
sctx->init(res);
|
|
2035
|
+
|
|
2036
|
+
return res;
|
|
2037
|
+
}
|
|
2038
|
+
|
|
2039
|
+
static void llama_sampler_temp_ext_backend_apply(
|
|
2040
|
+
struct llama_sampler * smpl,
|
|
2041
|
+
struct ggml_context * ctx,
|
|
2042
|
+
struct ggml_cgraph * gf,
|
|
2043
|
+
struct llama_sampler_data * data) {
|
|
2044
|
+
auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
|
|
2045
|
+
|
|
2046
|
+
// Revert to standard temperature scaling if delta or temp are non-positive.
|
|
2047
|
+
if (sctx->delta <= 0.0f || sctx->temp <= 0.0f) {
|
|
2048
|
+
llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
|
|
2049
|
+
return;
|
|
2050
|
+
}
|
|
2051
|
+
|
|
2052
|
+
// Calculate min_temp, max_temp, and max_entropy.
|
|
2053
|
+
const float min_temp = std::max(0.0f, sctx->temp - sctx->delta);
|
|
2054
|
+
const float max_temp = sctx->temp + sctx->delta;
|
|
2055
|
+
const float max_entropy = logf(data->logits->ne[0]);
|
|
2056
|
+
|
|
2057
|
+
// Calculate the probabilities.
|
|
2058
|
+
struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
|
|
2059
|
+
ggml_set_name(probs, "temp_ext_softmax_probs");
|
|
2060
|
+
|
|
2061
|
+
// Clamp probabilities to avoid log(0) which would give -inf
|
|
2062
|
+
struct ggml_tensor * probs_clamped = ggml_clamp(ctx, probs, 1e-10f, 1.0f);
|
|
2063
|
+
ggml_set_name(probs_clamped, "temp_ext_probs_clamped");
|
|
2064
|
+
|
|
2065
|
+
// Calculate the entropy, entropy = -Σ(p * log(p)).
|
|
2066
|
+
struct ggml_tensor * log_probs = ggml_log(ctx, probs_clamped);
|
|
2067
|
+
struct ggml_tensor * p_log_p = ggml_mul(ctx, probs_clamped, log_probs);
|
|
2068
|
+
struct ggml_tensor * sum_p_log_p = ggml_sum(ctx, p_log_p);
|
|
2069
|
+
struct ggml_tensor * entropy = ggml_scale(ctx, sum_p_log_p, -1.0f);
|
|
2070
|
+
ggml_set_name(log_probs, "temp_ext_log_probs");
|
|
2071
|
+
ggml_set_name(p_log_p, "temp_ext_p_log_p");
|
|
2072
|
+
ggml_set_name(sum_p_log_p, "temp_ext_sum_p_log_p");
|
|
2073
|
+
ggml_set_name(entropy, "temp_ext_entropy");
|
|
2074
|
+
|
|
2075
|
+
// Normalize the entropy, norm_entropy = entropy / max_entropy
|
|
2076
|
+
struct ggml_tensor * norm_entropy = ggml_scale(ctx, entropy, 1.0f / max_entropy);
|
|
2077
|
+
ggml_set_name(norm_entropy, "temp_ext_norm_entropy");
|
|
2078
|
+
|
|
2079
|
+
// Calculate the dynamic temperature:
|
|
2080
|
+
// dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent);
|
|
2081
|
+
//
|
|
2082
|
+
// Calculate powf(normalized_entropy, exponent) as
|
|
2083
|
+
// norm_entropy^exponent = exp(exponent * log(norm_entropy))
|
|
2084
|
+
struct ggml_tensor * log_norm_entropy = ggml_log(ctx, norm_entropy);
|
|
2085
|
+
struct ggml_tensor * scaled_log = ggml_scale(ctx, log_norm_entropy, sctx->exponent);
|
|
2086
|
+
struct ggml_tensor * pow_entropy = ggml_exp(ctx, scaled_log);
|
|
2087
|
+
// With pow_entropy computed we can now compute dyn_temp, scaling by
|
|
2088
|
+
// (max_temp - min_temp) and then adding min_temp.
|
|
2089
|
+
struct ggml_tensor * dyn_temp = ggml_scale_bias(ctx, pow_entropy, max_temp - min_temp, min_temp);
|
|
2090
|
+
ggml_set_name(log_norm_entropy, "temp_ext_log_norm_entropy");
|
|
2091
|
+
ggml_set_name(scaled_log, "temp_ext_scaled_log");
|
|
2092
|
+
ggml_set_name(pow_entropy, "temp_ext_pow_entropy");
|
|
2093
|
+
ggml_set_name(dyn_temp, "temp_ext_dyn_temp");
|
|
2094
|
+
|
|
2095
|
+
// Scale the logits by the dynamic temperature
|
|
2096
|
+
struct ggml_tensor * scaled_logits = ggml_div(ctx, data->logits, dyn_temp);
|
|
2097
|
+
ggml_set_name(scaled_logits, "temp_ext_scaled_logits");
|
|
2098
|
+
|
|
2099
|
+
data->logits = scaled_logits;
|
|
2100
|
+
}
|
|
2101
|
+
|
|
1212
2102
|
static struct llama_sampler_i llama_sampler_temp_ext_i = {
|
|
1213
|
-
/* .name
|
|
1214
|
-
/* .accept
|
|
1215
|
-
/* .apply
|
|
1216
|
-
/* .reset
|
|
1217
|
-
/* .clone
|
|
1218
|
-
/* .free
|
|
2103
|
+
/* .name = */ llama_sampler_temp_ext_name,
|
|
2104
|
+
/* .accept = */ nullptr,
|
|
2105
|
+
/* .apply = */ llama_sampler_temp_ext_apply,
|
|
2106
|
+
/* .reset = */ nullptr,
|
|
2107
|
+
/* .clone = */ llama_sampler_temp_ext_clone,
|
|
2108
|
+
/* .free = */ llama_sampler_temp_ext_free,
|
|
2109
|
+
/* .backend_init = */ llama_sampler_temp_ext_backend_init,
|
|
2110
|
+
/* .backend_accept = */ nullptr,
|
|
2111
|
+
/* .backend_apply = */ llama_sampler_temp_ext_backend_apply,
|
|
2112
|
+
/* .backend_set_input = */ nullptr,
|
|
1219
2113
|
};
|
|
1220
2114
|
|
|
1221
2115
|
struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
|
|
1222
|
-
|
|
2116
|
+
const bool is_empty = temp == 1.0f && delta <= 0.0f;
|
|
2117
|
+
|
|
2118
|
+
if (is_empty) {
|
|
2119
|
+
return llama_sampler_init_empty("?temp-ext");
|
|
2120
|
+
}
|
|
2121
|
+
|
|
2122
|
+
auto * res = llama_sampler_init(
|
|
1223
2123
|
/* .iface = */ &llama_sampler_temp_ext_i,
|
|
1224
2124
|
/* .ctx = */ new llama_sampler_temp_ext {
|
|
2125
|
+
("temp-ext"),
|
|
1225
2126
|
/* .temp = */ temp,
|
|
1226
2127
|
/* .delta = */ delta,
|
|
1227
2128
|
/* .exponent = */ exponent,
|
|
1228
2129
|
}
|
|
1229
2130
|
);
|
|
2131
|
+
|
|
2132
|
+
return res;
|
|
1230
2133
|
}
|
|
1231
2134
|
|
|
1232
2135
|
// xtc
|
|
@@ -1304,16 +2207,27 @@ static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
|
|
|
1304
2207
|
}
|
|
1305
2208
|
|
|
1306
2209
|
static struct llama_sampler_i llama_sampler_xtc_i = {
|
|
1307
|
-
/* .name
|
|
1308
|
-
/* .accept
|
|
1309
|
-
/* .apply
|
|
1310
|
-
/* .reset
|
|
1311
|
-
/* .clone
|
|
1312
|
-
/* .free
|
|
2210
|
+
/* .name = */ llama_sampler_xtc_name,
|
|
2211
|
+
/* .accept = */ nullptr,
|
|
2212
|
+
/* .apply = */ llama_sample_xtc_apply,
|
|
2213
|
+
/* .reset = */ llama_sampler_xtc_reset,
|
|
2214
|
+
/* .clone = */ llama_sampler_xtc_clone,
|
|
2215
|
+
/* .free = */ llama_sampler_xtc_free,
|
|
2216
|
+
/* .backend_init = */ nullptr,
|
|
2217
|
+
/* .backend_accept = */ nullptr,
|
|
2218
|
+
/* .backend_apply = */ nullptr,
|
|
2219
|
+
/* .backend_set_input = */ nullptr,
|
|
1313
2220
|
};
|
|
1314
2221
|
|
|
1315
2222
|
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
|
|
1316
|
-
|
|
2223
|
+
const bool is_empty = (p <= 0.0f || t > 0.5f);
|
|
2224
|
+
|
|
2225
|
+
if (is_empty) {
|
|
2226
|
+
return llama_sampler_init_empty("?xtc");
|
|
2227
|
+
}
|
|
2228
|
+
|
|
2229
|
+
const auto seed_cur = get_rng_seed(seed);
|
|
2230
|
+
|
|
1317
2231
|
return llama_sampler_init(
|
|
1318
2232
|
/* .iface = */ &llama_sampler_xtc_i,
|
|
1319
2233
|
/* .ctx = */ new llama_sampler_xtc {
|
|
@@ -1412,16 +2326,21 @@ static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
|
|
|
1412
2326
|
}
|
|
1413
2327
|
|
|
1414
2328
|
static struct llama_sampler_i llama_sampler_mirostat_i = {
|
|
1415
|
-
/* .name
|
|
1416
|
-
/* .accept
|
|
1417
|
-
/* .apply
|
|
1418
|
-
/* .reset
|
|
1419
|
-
/* .clone
|
|
1420
|
-
/* .free
|
|
2329
|
+
/* .name = */ llama_sampler_mirostat_name,
|
|
2330
|
+
/* .accept = */ nullptr,
|
|
2331
|
+
/* .apply = */ llama_sampler_mirostat_apply,
|
|
2332
|
+
/* .reset = */ llama_sampler_mirostat_reset,
|
|
2333
|
+
/* .clone = */ llama_sampler_mirostat_clone,
|
|
2334
|
+
/* .free = */ llama_sampler_mirostat_free,
|
|
2335
|
+
/* .backend_init = */ nullptr,
|
|
2336
|
+
/* .backend_accept = */ nullptr,
|
|
2337
|
+
/* .backend_apply = */ nullptr,
|
|
2338
|
+
/* .backend_set_input = */ nullptr,
|
|
1421
2339
|
};
|
|
1422
2340
|
|
|
1423
2341
|
struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
|
|
1424
|
-
auto seed_cur = get_rng_seed(seed);
|
|
2342
|
+
const auto seed_cur = get_rng_seed(seed);
|
|
2343
|
+
|
|
1425
2344
|
return llama_sampler_init(
|
|
1426
2345
|
/* .iface = */ &llama_sampler_mirostat_i,
|
|
1427
2346
|
/* .ctx = */ new llama_sampler_mirostat {
|
|
@@ -1511,12 +2430,16 @@ static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) {
|
|
|
1511
2430
|
}
|
|
1512
2431
|
|
|
1513
2432
|
static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
|
|
1514
|
-
/* .name
|
|
1515
|
-
/* .accept
|
|
1516
|
-
/* .apply
|
|
1517
|
-
/* .reset
|
|
1518
|
-
/* .clone
|
|
1519
|
-
/* .free
|
|
2433
|
+
/* .name = */ llama_sampler_mirostat_v2_name,
|
|
2434
|
+
/* .accept = */ nullptr,
|
|
2435
|
+
/* .apply = */ llama_sampler_mirostat_v2_apply,
|
|
2436
|
+
/* .reset = */ llama_sampler_mirostat_v2_reset,
|
|
2437
|
+
/* .clone = */ llama_sampler_mirostat_v2_clone,
|
|
2438
|
+
/* .free = */ llama_sampler_mirostat_v2_free,
|
|
2439
|
+
/* .backend_init = */ nullptr,
|
|
2440
|
+
/* .backend_accept = */ nullptr,
|
|
2441
|
+
/* .backend_apply = */ nullptr,
|
|
2442
|
+
/* .backend_set_input = */ nullptr,
|
|
1520
2443
|
};
|
|
1521
2444
|
|
|
1522
2445
|
struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
|
|
@@ -1628,12 +2551,16 @@ static void llama_sampler_grammar_free(struct llama_sampler * smpl) {
|
|
|
1628
2551
|
}
|
|
1629
2552
|
|
|
1630
2553
|
static struct llama_sampler_i llama_sampler_grammar_i = {
|
|
1631
|
-
/* .name
|
|
1632
|
-
/* .accept
|
|
1633
|
-
/* .apply
|
|
1634
|
-
/* .reset
|
|
1635
|
-
/* .clone
|
|
1636
|
-
/* .free
|
|
2554
|
+
/* .name = */ llama_sampler_grammar_name,
|
|
2555
|
+
/* .accept = */ llama_sampler_grammar_accept_impl,
|
|
2556
|
+
/* .apply = */ llama_sampler_grammar_apply,
|
|
2557
|
+
/* .reset = */ llama_sampler_grammar_reset,
|
|
2558
|
+
/* .clone = */ llama_sampler_grammar_clone,
|
|
2559
|
+
/* .free = */ llama_sampler_grammar_free,
|
|
2560
|
+
/* .backend_init = */ nullptr,
|
|
2561
|
+
/* .backend_accept = */ nullptr,
|
|
2562
|
+
/* .backend_apply = */ nullptr,
|
|
2563
|
+
/* .backend_set_input = */ nullptr,
|
|
1637
2564
|
};
|
|
1638
2565
|
|
|
1639
2566
|
static struct llama_sampler * llama_sampler_init_grammar_impl(
|
|
@@ -1835,12 +2762,16 @@ static void llama_sampler_penalties_free(struct llama_sampler * smpl) {
|
|
|
1835
2762
|
}
|
|
1836
2763
|
|
|
1837
2764
|
static struct llama_sampler_i llama_sampler_penalties_i = {
|
|
1838
|
-
/* .name
|
|
1839
|
-
/* .accept
|
|
1840
|
-
/* .apply
|
|
1841
|
-
/* .reset
|
|
1842
|
-
/* .clone
|
|
1843
|
-
/* .free
|
|
2765
|
+
/* .name = */ llama_sampler_penalties_name,
|
|
2766
|
+
/* .accept = */ llama_sampler_penalties_accept,
|
|
2767
|
+
/* .apply = */ llama_sampler_penalties_apply,
|
|
2768
|
+
/* .reset = */ llama_sampler_penalties_reset,
|
|
2769
|
+
/* .clone = */ llama_sampler_penalties_clone,
|
|
2770
|
+
/* .free = */ llama_sampler_penalties_free,
|
|
2771
|
+
/* .backend_init = */ nullptr,
|
|
2772
|
+
/* .backend_accept = */ nullptr,
|
|
2773
|
+
/* .backend_apply = */ nullptr,
|
|
2774
|
+
/* .backend_set_input = */ nullptr,
|
|
1844
2775
|
};
|
|
1845
2776
|
|
|
1846
2777
|
struct llama_sampler * llama_sampler_init_penalties(
|
|
@@ -1850,6 +2781,12 @@ struct llama_sampler * llama_sampler_init_penalties(
|
|
|
1850
2781
|
float penalty_present) {
|
|
1851
2782
|
penalty_last_n = std::max(penalty_last_n, 0);
|
|
1852
2783
|
|
|
2784
|
+
const bool is_empty = (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f));
|
|
2785
|
+
|
|
2786
|
+
if (is_empty) {
|
|
2787
|
+
return llama_sampler_init_empty("?penalties");
|
|
2788
|
+
}
|
|
2789
|
+
|
|
1853
2790
|
return llama_sampler_init(
|
|
1854
2791
|
/* .iface = */ &llama_sampler_penalties_i,
|
|
1855
2792
|
/* .ctx = */ new llama_sampler_penalties {
|
|
@@ -1887,9 +2824,7 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
|
|
|
1887
2824
|
for (size_t i = 0; i < cur_p->size; ++i) {
|
|
1888
2825
|
// Only count non-negative infinity values
|
|
1889
2826
|
if (cur_p->data[i].logit != -INFINITY) {
|
|
1890
|
-
|
|
1891
|
-
max = cur_p->data[i].logit;
|
|
1892
|
-
}
|
|
2827
|
+
max = std::max(max, cur_p->data[i].logit);
|
|
1893
2828
|
logits_sum += cur_p->data[i].logit;
|
|
1894
2829
|
valid_count++;
|
|
1895
2830
|
}
|
|
@@ -1926,15 +2861,25 @@ static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
|
|
|
1926
2861
|
}
|
|
1927
2862
|
|
|
1928
2863
|
static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
|
|
1929
|
-
/* .name
|
|
1930
|
-
/* .accept
|
|
1931
|
-
/* .apply
|
|
1932
|
-
/* .reset
|
|
1933
|
-
/* .clone
|
|
1934
|
-
/* .free
|
|
2864
|
+
/* .name = */ llama_sampler_top_n_sigma_name,
|
|
2865
|
+
/* .accept = */ nullptr,
|
|
2866
|
+
/* .apply = */ llama_sampler_top_n_sigma_apply,
|
|
2867
|
+
/* .reset = */ nullptr,
|
|
2868
|
+
/* .clone = */ llama_sampler_top_n_sigma_clone,
|
|
2869
|
+
/* .free = */ llama_sampler_top_n_sigma_free,
|
|
2870
|
+
/* .backend_init = */ nullptr,
|
|
2871
|
+
/* .backend_accept = */ nullptr,
|
|
2872
|
+
/* .backend_apply = */ nullptr,
|
|
2873
|
+
/* .backend_set_input = */ nullptr,
|
|
1935
2874
|
};
|
|
1936
2875
|
|
|
1937
2876
|
struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
|
|
2877
|
+
const bool is_empty = (n <= 0.0f);
|
|
2878
|
+
|
|
2879
|
+
if (is_empty) {
|
|
2880
|
+
return llama_sampler_init_empty("?top-n-sigma");
|
|
2881
|
+
}
|
|
2882
|
+
|
|
1938
2883
|
return llama_sampler_init(
|
|
1939
2884
|
/* .iface = */ &llama_sampler_top_n_sigma_i,
|
|
1940
2885
|
/* .ctx = */ new llama_sampler_top_n_sigma {
|
|
@@ -2256,12 +3201,16 @@ static void llama_sampler_dry_free(struct llama_sampler * smpl) {
|
|
|
2256
3201
|
}
|
|
2257
3202
|
|
|
2258
3203
|
static struct llama_sampler_i llama_sampler_dry_i = {
|
|
2259
|
-
/* .name
|
|
2260
|
-
/* .accept
|
|
2261
|
-
/* .apply
|
|
2262
|
-
/* .reset
|
|
2263
|
-
/* .clone
|
|
2264
|
-
/* .free
|
|
3204
|
+
/* .name = */ llama_sampler_dry_name,
|
|
3205
|
+
/* .accept = */ llama_sampler_dry_accept,
|
|
3206
|
+
/* .apply = */ llama_sampler_dry_apply,
|
|
3207
|
+
/* .reset = */ llama_sampler_dry_reset,
|
|
3208
|
+
/* .clone = */ llama_sampler_dry_clone,
|
|
3209
|
+
/* .free = */ llama_sampler_dry_free,
|
|
3210
|
+
/* .backend_init = */ nullptr,
|
|
3211
|
+
/* .backend_accept = */ nullptr,
|
|
3212
|
+
/* .backend_apply = */ nullptr,
|
|
3213
|
+
/* .backend_set_input = */ nullptr,
|
|
2265
3214
|
};
|
|
2266
3215
|
|
|
2267
3216
|
struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
|
|
@@ -2272,6 +3221,10 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
|
|
|
2272
3221
|
|
|
2273
3222
|
const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
|
|
2274
3223
|
|
|
3224
|
+
if (!dry_enabled) {
|
|
3225
|
+
return llama_sampler_init_empty("?dry");
|
|
3226
|
+
}
|
|
3227
|
+
|
|
2275
3228
|
if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
|
|
2276
3229
|
// Process sequence breakers
|
|
2277
3230
|
for (size_t i = 0; i < num_breakers; ++i) {
|
|
@@ -2342,16 +3295,23 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
|
|
|
2342
3295
|
|
|
2343
3296
|
// logit-bias
|
|
2344
3297
|
|
|
2345
|
-
struct llama_sampler_logit_bias {
|
|
3298
|
+
struct llama_sampler_logit_bias : public llama_sampler_backend {
|
|
2346
3299
|
const int32_t n_vocab;
|
|
2347
3300
|
|
|
2348
3301
|
const std::vector<llama_logit_bias> logit_bias;
|
|
2349
3302
|
|
|
2350
3303
|
std::vector<llama_logit_bias> to_search;
|
|
3304
|
+
|
|
3305
|
+
struct ggml_tensor * inp_logit_bias;
|
|
3306
|
+
struct ggml_tensor * inp_logit_idxs;
|
|
3307
|
+
|
|
3308
|
+
ggml_context_ptr inp_ctx;
|
|
3309
|
+
ggml_backend_buffer_ptr inp_buf;
|
|
2351
3310
|
};
|
|
2352
3311
|
|
|
2353
|
-
static const char * llama_sampler_logit_bias_name(const struct llama_sampler *
|
|
2354
|
-
|
|
3312
|
+
static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {
|
|
3313
|
+
auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
|
|
3314
|
+
return ctx->get_name();
|
|
2355
3315
|
}
|
|
2356
3316
|
|
|
2357
3317
|
static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
|
|
@@ -2396,25 +3356,123 @@ static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) {
|
|
|
2396
3356
|
delete (llama_sampler_logit_bias *) smpl->ctx;
|
|
2397
3357
|
}
|
|
2398
3358
|
|
|
3359
|
+
// Graph-side ("backend") logit-bias step.
// When at least one bias is configured, derives a tensor from data->logits with
// ggml_fill(..., 0.0f), writes sctx->inp_logit_bias into it at the rows named by
// sctx->inp_logit_idxs (ggml_set_rows semantics assumed - confirm against ggml.h),
// and replaces data->logits with logits + that tensor.
// Returns without touching data->logits when logit_bias is empty. gf is not used.
static void llama_sampler_logit_bias_backend_apply(
|
|
3360
|
+
struct llama_sampler * smpl,
|
|
3361
|
+
struct ggml_context * ctx,
|
|
3362
|
+
struct ggml_cgraph * gf,
|
|
3363
|
+
struct llama_sampler_data * data) {
|
|
3364
|
+
GGML_UNUSED(gf);
|
|
3365
|
+
GGML_UNUSED(ctx); // NOTE(review): ctx is passed to every ggml_* call below, so this marker looks stale
|
|
3366
|
+
|
|
3367
|
+
auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
|
|
3368
|
+
if (sctx->logit_bias.empty()) {
|
|
3369
|
+
return; // nothing to add; data->logits is left as-is
|
|
3370
|
+
}
|
|
3371
|
+
|
|
3372
|
+
ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f); // presumably a zero tensor shaped like the logits - confirm
|
|
3373
|
+
|
|
3374
|
+
cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur)); // dims (1, nelements): one element per row, same row width as inp_logit_bias
|
|
3375
|
+
cur = ggml_set_rows(ctx, cur, sctx->inp_logit_bias, sctx->inp_logit_idxs); // bias values go to the rows indexed by the token ids
|
|
3376
|
+
cur = ggml_reshape_1d(ctx, cur, ggml_nelements(cur)); // flatten again before the add
|
|
3377
|
+
|
|
3378
|
+
data->logits = ggml_add(ctx, data->logits, cur);
|
|
3379
|
+
}
|
|
3380
|
+
|
|
3381
|
+
// Uploads the configured biases into the two backend input tensors:
// bias values go to sctx->inp_logit_bias, token ids go to sctx->inp_logit_idxs.
// No-op when logit_bias is empty. Asserts that both tensors are non-null and that
// every token id lies in [0, n_vocab).
static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * smpl) {
|
|
3382
|
+
auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
|
|
3383
|
+
if (sctx->logit_bias.empty()) {
|
|
3384
|
+
return;
|
|
3385
|
+
}
|
|
3386
|
+
|
|
3387
|
+
GGML_ASSERT(sctx->inp_logit_bias != nullptr); // both tensors are created in llama_sampler_logit_bias_backend_init
|
|
3388
|
+
GGML_ASSERT(sctx->inp_logit_idxs != nullptr);
|
|
3389
|
+
|
|
3390
|
+
const size_t n = sctx->logit_bias.size();
|
|
3391
|
+
|
|
3392
|
+
std::vector<float> data_logit_bias(n, 0.0f); // host-side staging buffers, one slot per configured bias
|
|
3393
|
+
std::vector<int32_t> data_logit_idxs(n, 0);
|
|
3394
|
+
for (size_t i = 0; i < n; ++i) {
|
|
3395
|
+
const auto & lb = sctx->logit_bias[i];
|
|
3396
|
+
GGML_ASSERT(lb.token >= 0 && lb.token < (int32_t) sctx->n_vocab); // out-of-range ids are treated as a hard error, not skipped
|
|
3397
|
+
data_logit_bias[i] = lb.bias;
|
|
3398
|
+
data_logit_idxs[i] = lb.token;
|
|
3399
|
+
}
|
|
3400
|
+
|
|
3401
|
+
ggml_backend_tensor_set(sctx->inp_logit_bias, data_logit_bias.data(), 0, ggml_nbytes(sctx->inp_logit_bias)); // copies ggml_nbytes(tensor) bytes; assumes the tensor still holds exactly n floats - confirm
|
|
3402
|
+
ggml_backend_tensor_set(sctx->inp_logit_idxs, data_logit_idxs.data(), 0, ggml_nbytes(sctx->inp_logit_idxs)); // same assumption for the n int32 ids
|
|
3403
|
+
}
|
|
3404
|
+
|
|
3405
|
+
// Prepares the backend input tensors used by the logit-bias sampler.
// Always calls sctx->init(true). When no biases are configured it returns true
// right away. Otherwise it creates a no_alloc ggml context sized for two tensor
// headers, declares inp_logit_bias (F32, dims (1, n)) and inp_logit_idxs (I32, n)
// as named graph inputs, allocates them from `buft`, clears the buffer with 0 and
// returns true. No path in the visible code returns false.
static bool llama_sampler_logit_bias_backend_init(
|
|
3406
|
+
struct llama_sampler * smpl,
|
|
3407
|
+
ggml_backend_buffer_type_t buft) {
|
|
3408
|
+
auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
|
|
3409
|
+
|
|
3410
|
+
sctx->init(true); // not declared in llama_sampler_logit_bias itself; presumably inherited from llama_sampler_backend - confirm
|
|
3411
|
+
|
|
3412
|
+
if (sctx->logit_bias.empty()) {
|
|
3413
|
+
return true;
|
|
3414
|
+
}
|
|
3415
|
+
|
|
3416
|
+
ggml_init_params params = {
|
|
3417
|
+
/*.mem_size =*/ 2*ggml_tensor_overhead(), // room for the two tensor headers created below
|
|
3418
|
+
/*.mem_buffer =*/ nullptr,
|
|
3419
|
+
/*.no_alloc =*/ true, // tensor data comes from the backend buffer allocated further down
|
|
3420
|
+
};
|
|
3421
|
+
|
|
3422
|
+
sctx->inp_ctx.reset(ggml_init(params)); // NOTE(review): ggml_init result is used below without a null check
|
|
3423
|
+
|
|
3424
|
+
const size_t n = sctx->logit_bias.size();
|
|
3425
|
+
|
|
3426
|
+
sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n); // n rows of width 1, matching the reshape in backend_apply
|
|
3427
|
+
ggml_set_name(sctx->inp_logit_bias, "logit_bias");
|
|
3428
|
+
ggml_set_input(sctx->inp_logit_bias);
|
|
3429
|
+
|
|
3430
|
+
sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n); // one token id per bias row
|
|
3431
|
+
ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
|
|
3432
|
+
ggml_set_input(sctx->inp_logit_idxs);
|
|
3433
|
+
|
|
3434
|
+
// Allocate all tensors from our context to the backend
|
|
3435
|
+
sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft)); // NOTE(review): result is not null-checked; confirm allocation cannot fail here, or return false on failure
|
|
3436
|
+
|
|
3437
|
+
ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
|
|
3438
|
+
|
|
3439
|
+
return true;
|
|
3440
|
+
}
|
|
3441
|
+
|
|
2399
3442
|
static struct llama_sampler_i llama_sampler_logit_bias_i = {
|
|
2400
|
-
/* .name
|
|
2401
|
-
/* .accept
|
|
2402
|
-
/* .apply
|
|
2403
|
-
/* .reset
|
|
2404
|
-
/* .clone
|
|
2405
|
-
/* .free
|
|
3443
|
+
/* .name = */ llama_sampler_logit_bias_name,
|
|
3444
|
+
/* .accept = */ nullptr,
|
|
3445
|
+
/* .apply = */ llama_sampler_logit_bias_apply,
|
|
3446
|
+
/* .reset = */ nullptr,
|
|
3447
|
+
/* .clone = */ llama_sampler_logit_bias_clone,
|
|
3448
|
+
/* .free = */ llama_sampler_logit_bias_free,
|
|
3449
|
+
/* .backend_init = */ llama_sampler_logit_bias_backend_init,
|
|
3450
|
+
/* .backend_accept = */ nullptr,
|
|
3451
|
+
/* .backend_apply = */ llama_sampler_logit_bias_backend_apply,
|
|
3452
|
+
/* .backend_set_input = */ llama_sampler_logit_bias_backend_set_input,
|
|
2406
3453
|
};
|
|
2407
3454
|
|
|
2408
3455
|
struct llama_sampler * llama_sampler_init_logit_bias(
|
|
2409
3456
|
int32_t n_vocab,
|
|
2410
3457
|
int32_t n_logit_bias,
|
|
2411
3458
|
const llama_logit_bias * logit_bias) {
|
|
3459
|
+
const bool is_empty = n_logit_bias <= 0;
|
|
3460
|
+
|
|
3461
|
+
if (is_empty) {
|
|
3462
|
+
return llama_sampler_init_empty("?logit-bias");
|
|
3463
|
+
}
|
|
3464
|
+
|
|
2412
3465
|
return llama_sampler_init(
|
|
2413
3466
|
/* .iface = */ &llama_sampler_logit_bias_i,
|
|
2414
3467
|
/* .ctx = */ new llama_sampler_logit_bias {
|
|
2415
|
-
|
|
2416
|
-
/* .
|
|
2417
|
-
/* .
|
|
3468
|
+
("logit-bias"),
|
|
3469
|
+
/* .n_vocab = */ n_vocab,
|
|
3470
|
+
/* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
|
|
3471
|
+
/* .to_search = */ {},
|
|
3472
|
+
/* .inp_logit_bias = */ nullptr,
|
|
3473
|
+
/* .inp_logit_idxs = */ nullptr,
|
|
3474
|
+
/* .inp_ctx = */ nullptr,
|
|
3475
|
+
/* .inp_buf = */ nullptr,
|
|
2418
3476
|
}
|
|
2419
3477
|
);
|
|
2420
3478
|
}
|
|
@@ -2627,12 +3685,16 @@ static void llama_sampler_infill_free(struct llama_sampler * smpl) {
|
|
|
2627
3685
|
}
|
|
2628
3686
|
|
|
2629
3687
|
// Interface table for the infill sampler: name/apply/clone/free are set; accept,
// reset and all four backend_* hooks are nullptr. Field labels follow the
// init/accept/apply/set_input order used by the other llama_sampler_i tables here.
static struct llama_sampler_i llama_sampler_infill_i = {
|
|
2630
|
-
/* .name
|
|
2631
|
-
/* .accept
|
|
2632
|
-
/* .apply
|
|
2633
|
-
/* .reset
|
|
2634
|
-
/* .clone
|
|
2635
|
-
/* .free
|
|
3688
|
+
/* .name = */ llama_sampler_infill_name,
|
|
3689
|
+
/* .accept = */ nullptr,
|
|
3690
|
+
/* .apply = */ llama_sampler_infill_apply,
|
|
3691
|
+
/* .reset = */ nullptr,
|
|
3692
|
+
/* .clone = */ llama_sampler_infill_clone,
|
|
3693
|
+
/* .free = */ llama_sampler_infill_free,
|
|
3694
|
+
/* .backend_init = */ nullptr,
|
|
3695
|
+
/* .backend_accept = */ nullptr,
|
|
3696
|
+
/* .backend_apply = */ nullptr,
|
|
3697
|
+
/* .backend_set_input = */ nullptr,
|
|
2636
3698
|
};
|
|
2637
3699
|
|
|
2638
3700
|
struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
|
|
@@ -2664,7 +3726,7 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
|
|
|
2664
3726
|
if (smpl->iface == &llama_sampler_chain_i) {
|
|
2665
3727
|
const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
|
|
2666
3728
|
for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
|
|
2667
|
-
const uint32_t seed = llama_sampler_get_seed(
|
|
3729
|
+
const uint32_t seed = llama_sampler_get_seed(it->ptr);
|
|
2668
3730
|
if (seed != LLAMA_DEFAULT_SEED) {
|
|
2669
3731
|
return seed;
|
|
2670
3732
|
}
|