@fugood/llama.node 1.4.11 → 1.4.13
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +31 -31
- package/src/llama.cpp/common/arg.cpp +128 -59
- package/src/llama.cpp/common/arg.h +1 -0
- package/src/llama.cpp/common/chat-parser.cpp +11 -0
- package/src/llama.cpp/common/chat.cpp +36 -7
- package/src/llama.cpp/common/chat.h +1 -0
- package/src/llama.cpp/common/common.cpp +42 -23
- package/src/llama.cpp/common/common.h +11 -1
- package/src/llama.cpp/common/llguidance.cpp +10 -6
- package/src/llama.cpp/common/regex-partial.cpp +13 -13
- package/src/llama.cpp/common/sampling.cpp +58 -14
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +23 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +12 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +86 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +15 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +768 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +0 -4
- package/src/llama.cpp/include/llama.h +100 -12
- package/src/llama.cpp/src/CMakeLists.txt +4 -0
- package/src/llama.cpp/src/llama-adapter.cpp +12 -3
- package/src/llama.cpp/src/llama-adapter.h +7 -1
- package/src/llama.cpp/src/llama-arch.cpp +78 -0
- package/src/llama.cpp/src/llama-arch.h +8 -0
- package/src/llama.cpp/src/llama-chat.cpp +11 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +637 -49
- package/src/llama.cpp/src/llama-context.h +43 -1
- package/src/llama.cpp/src/llama-grammar.cpp +40 -13
- package/src/llama.cpp/src/llama-grammar.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +173 -5
- package/src/llama.cpp/src/llama-graph.h +71 -6
- package/src/llama.cpp/src/llama-hparams.cpp +4 -0
- package/src/llama.cpp/src/llama-hparams.h +12 -5
- package/src/llama.cpp/src/llama-kv-cache.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -4
- package/src/llama.cpp/src/llama-model-loader.cpp +23 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model-saver.cpp +3 -0
- package/src/llama.cpp/src/llama-model.cpp +337 -26
- package/src/llama.cpp/src/llama-model.h +13 -2
- package/src/llama.cpp/src/llama-sampling.cpp +1259 -186
- package/src/llama.cpp/src/llama-sampling.h +19 -7
- package/src/llama.cpp/src/llama-vocab.cpp +101 -33
- package/src/llama.cpp/src/llama-vocab.h +2 -0
- package/src/llama.cpp/src/llama.cpp +87 -64
- package/src/llama.cpp/src/models/afmoe.cpp +9 -5
- package/src/llama.cpp/src/models/bert.cpp +4 -2
- package/src/llama.cpp/src/models/cogvlm.cpp +5 -3
- package/src/llama.cpp/src/models/cohere2-iswa.cpp +3 -0
- package/src/llama.cpp/src/models/deepseek2.cpp +1 -1
- package/src/llama.cpp/src/models/gemma-embedding.cpp +2 -6
- package/src/llama.cpp/src/models/gemma2-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/gemma3.cpp +3 -4
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +4 -7
- package/src/llama.cpp/src/models/llama-iswa.cpp +6 -2
- package/src/llama.cpp/src/models/llama.cpp +19 -6
- package/src/llama.cpp/src/models/maincoder.cpp +117 -0
- package/src/llama.cpp/src/models/mimo2-iswa.cpp +123 -0
- package/src/llama.cpp/src/models/models.h +18 -0
- package/src/llama.cpp/src/models/modern-bert.cpp +116 -0
- package/src/llama.cpp/src/models/openai-moe-iswa.cpp +5 -2
- package/src/llama.cpp/src/models/plamo3.cpp +128 -0
- package/src/llama.cpp/src/models/smallthinker.cpp +11 -5
- package/src/llama.cpp/src/unicode.cpp +23 -14
The hunks below are from package/src/llama.cpp/src/llama-sampling.cpp:

```diff
@@ -4,6 +4,8 @@
 #include "llama-vocab.h"
 #include "llama-grammar.h"
 
+#include "ggml-cpp.h"
+
 #include <array>
 #include <algorithm>
 #include <cassert>
```
```diff
@@ -346,7 +348,9 @@ static uint32_t get_rng_seed(uint32_t seed) {
 
 // llama_sampler API
 
-struct llama_sampler * llama_sampler_init(
+struct llama_sampler * llama_sampler_init(
+        struct llama_sampler_i * iface,
+        llama_sampler_context_t ctx) {
     return new llama_sampler {
         /* .iface = */ iface,
         /* .ctx   = */ ctx,
```
```diff
@@ -421,37 +425,200 @@ void llama_sampler_free(struct llama_sampler * smpl) {
     delete smpl;
 }
 
-
-    const auto * logits = llama_get_logits_ith(ctx, idx);
+// empty sampler
 
-
-    const
+struct llama_sampler_empty {
+    const char * name;
+};
 
-
+static struct llama_sampler * llama_sampler_init_empty(const char * name);
+
+static const char * llama_sampler_empty_name(const struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_empty *) smpl->ctx;
+    return ctx->name;
+}
+
+static void llama_sampler_empty_accept(struct llama_sampler * smpl, llama_token token) {
+    GGML_UNUSED(smpl);
+    GGML_UNUSED(token);
+}
+
+static void llama_sampler_empty_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    GGML_UNUSED(smpl);
+    GGML_UNUSED(cur_p);
+}
+
+static void llama_sampler_empty_reset(struct llama_sampler * smpl) {
+    GGML_UNUSED(smpl);
+}
+
+static struct llama_sampler * llama_sampler_empty_clone(const struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_empty *) smpl->ctx;
+    return llama_sampler_init_empty(ctx->name);
+}
+
+static void llama_sampler_empty_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_empty *) smpl->ctx;
+}
+
+static bool llama_sampler_empty_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    GGML_UNUSED(smpl);
+    GGML_UNUSED(buft);
+
+    return true;
+}
+
+static void llama_sampler_empty_backend_accept(
+        struct llama_sampler * smpl,
+        ggml_context * ctx,
+        ggml_cgraph * gf,
+        struct ggml_tensor * selected_token) {
+    GGML_UNUSED(smpl);
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(gf);
+    GGML_UNUSED(selected_token);
+}
+
+static void llama_sampler_empty_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    GGML_UNUSED(smpl);
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(gf);
+    GGML_UNUSED(data);
+}
+
+static void llama_sampler_empty_backend_set_input(struct llama_sampler * smpl) {
+    GGML_UNUSED(smpl);
+}
+
+static struct llama_sampler_i llama_sampler_empty_i = {
+    /* .name              = */ llama_sampler_empty_name,
+    /* .accept            = */ llama_sampler_empty_accept,
+    /* .apply             = */ llama_sampler_empty_apply,
+    /* .reset             = */ llama_sampler_empty_reset,
+    /* .clone             = */ llama_sampler_empty_clone,
+    /* .free              = */ llama_sampler_empty_free,
+    /* .backend_init      = */ llama_sampler_empty_backend_init,
+    /* .backend_accept    = */ llama_sampler_empty_backend_accept,
+    /* .backend_apply     = */ llama_sampler_empty_backend_apply,
+    /* .backend_set_input = */ llama_sampler_empty_backend_set_input,
+};
+
+struct llama_sampler * llama_sampler_init_empty(const char * name) {
+    return llama_sampler_init(
+        /* .iface = */ &llama_sampler_empty_i,
+        /* .ctx   = */ new llama_sampler_empty {
+            /* .name = */ name,
+        }
+    );
+}
+
+// common backend sampler functionality
+//
+// +name : means that the sampler is supported and will run on the backend
+// -name : means that a ggml operator is not supported by the backend
+//
+struct llama_sampler_backend {
+    llama_sampler_backend(const char * name) : name(name), name_ext(name), is_init(false), support(false) {}
+
+    const char * get_name() {
+        if (!is_init) {
+            return name.c_str();
+        }
 
-
-
-
-
-
+        if (support) {
+            name_ext = "+" + name;
+        } else {
+            name_ext = "-" + name;
+        }
+
+        return name_ext.c_str();
     }
 
-
-
-
-
-
+    void init(bool support) {
+        GGML_ASSERT(this->is_init == false);
+
+        this->is_init = true;
+        this->support = support;
+    }
+
+private:
+    std::string name;
+    std::string name_ext;
+
+    bool is_init;
+    bool support;
+};
+
+// check if all ggml ops used by the sampler are supported by the backend
+static bool llama_sampler_backend_support(
+        llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * device = ggml_backend_buft_get_device(buft);
+    if (!device) {
+        // CPU backend always supported
+        return true;
+    }
+
+    ggml_init_params params = {
+        /*.mem_size   =*/ 128*ggml_tensor_overhead() + ggml_graph_overhead(),
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ true,
     };
 
-
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    if (!ctx_ptr) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
 
-
+    ggml_context * ctx = ctx_ptr.get();
 
-
+    const int64_t n = 1024*1024;
 
-
+    llama_sampler_data data = {
+        /*.logits     = */ ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n),
+        /*.probs      = */ nullptr,
+        /*.sampled    = */ nullptr,
+        /*.candidates = */ ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n),
+    };
 
-
+    ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    smpl->iface->backend_apply(smpl, ctx, gf, &data);
+
+    if (data.logits) {
+        ggml_build_forward_expand(gf, data.logits);
+    }
+
+    if (data.probs) {
+        ggml_build_forward_expand(gf, data.probs);
+    }
+
+    if (data.sampled) {
+        ggml_build_forward_expand(gf, data.sampled);
+    }
+
+    if (data.candidates) {
+        ggml_build_forward_expand(gf, data.candidates);
+    }
+
+    for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+        struct ggml_tensor * op = ggml_graph_node(gf, i);
+
+        if (!ggml_backend_dev_supports_op(device, op)) {
+            LLAMA_LOG_WARN("%s: device '%s' does not have support for op %s needed for sampler '%s'\n",
+                    __func__, ggml_backend_dev_name(device), ggml_op_name(op->op), smpl->iface->name(smpl));
+
+            return false;
+        }
+    }
+
+    return true;
 }
 
 // sampler chain
```
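The `llama_sampler_backend` helper above drives the `+name`/`-name` reporting: a sampler's name is only decorated after `init()` has recorded whether its graph ops are supported. A minimal stand-alone sketch of that convention (simplified types, not the actual llama.cpp structs):

```cpp
// Hypothetical mini-model of the "+name"/"-name" convention: before
// backend_init() the plain name is reported; afterwards a '+' prefix marks a
// sampler that runs on the backend and '-' one that fell back to the CPU.
#include <iostream>
#include <string>

struct backend_name {
    std::string name;
    bool is_init   = false;
    bool supported = false;

    std::string get() const {
        if (!is_init) {
            return name;                        // backend_init() not called yet
        }
        return (supported ? "+" : "-") + name;  // decorated after init
    }
};

int main() {
    backend_name n{"top-k"};
    std::cout << n.get() << '\n';  // prints "top-k"
    n.is_init   = true;
    n.supported = true;
    std::cout << n.get() << '\n';  // prints "+top-k"
}
```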
```diff
@@ -465,8 +632,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token
 
     time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_accept(smpl, token);
+    for (auto & smpl : chain->samplers) {
+        llama_sampler_accept(smpl.ptr, token);
     }
 
     chain->n_sample++;
```
```diff
@@ -477,16 +644,28 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d
 
     time_meas tm(chain->t_sample_us, chain->params.no_perf);
 
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_apply(smpl, cur_p);
+    bool is_backend = chain->is_init;
+
+    for (auto & smpl : chain->samplers) {
+        if (is_backend && smpl.is_backend) {
+            continue;
+        }
+
+        is_backend = false;
+
+        if (smpl.ptr->iface->apply == nullptr) {
+            continue;
+        }
+
+        llama_sampler_apply(smpl.ptr, cur_p);
     }
 }
 
 static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_reset(smpl);
+    for (auto & smpl : chain->samplers) {
+        llama_sampler_reset(smpl.ptr);
     }
 }
 
```
```diff
@@ -495,8 +674,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
 
     auto * result = llama_sampler_chain_init(chain_src->params);
 
-    for (auto * smpl : chain_src->samplers) {
-        llama_sampler_chain_add(result, llama_sampler_clone(smpl));
+    for (const auto & smpl : chain_src->samplers) {
+        llama_sampler_chain_add(result, llama_sampler_clone(smpl.ptr));
     }
 
     return result;
```
```diff
@@ -505,20 +684,109 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl
 static void llama_sampler_chain_free(struct llama_sampler * smpl) {
     auto * chain = (llama_sampler_chain *) smpl->ctx;
 
-    for (auto * smpl : chain->samplers) {
-        llama_sampler_free(smpl);
+    for (auto & smpl : chain->samplers) {
+        llama_sampler_free(smpl.ptr);
     }
 
     delete chain;
 }
 
+static bool llama_sampler_chain_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    GGML_ASSERT(chain->is_init == false && "llama_sampler_chain_backend_init() called twice");
+
+    chain->is_init = true;
+
+    bool res = true;
+
+    for (auto & smpl : chain->samplers) {
+        bool res_cur = true;
+
+        // to be able to run a sampler on the backend, it has to:
+        // - have the .backend_init() API implemented
+        // - return true during .backend_init()
+        if (smpl.ptr->iface->backend_init) {
+            if (!smpl.ptr->iface->backend_init(smpl.ptr, buft)) {
+                res_cur = false;
+            }
+        } else {
+            res_cur = false;
+        }
+
+        smpl.is_backend = res_cur;
+
+        res = res && res_cur;
+    }
+
+    return res;
+}
+
+static void llama_sampler_chain_backend_accept(
+        struct llama_sampler * smpl,
+        ggml_context * ctx,
+        ggml_cgraph * gf,
+        struct ggml_tensor * selected_token) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    for (auto & smpl : chain->samplers) {
+        if (!smpl.is_backend) {
+            break;
+        }
+
+        if (smpl.ptr->iface->backend_accept) {
+            smpl.ptr->iface->backend_accept(smpl.ptr, ctx, gf, selected_token);
+        }
+    }
+}
+
+static void llama_sampler_chain_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    GGML_ASSERT(chain->is_init && "llama_sampler_chain_backend_init() not called");
+
+    for (auto & smpl : chain->samplers) {
+        if (!smpl.is_backend) {
+            break;
+        }
+
+        if (smpl.ptr->iface->backend_apply) {
+            smpl.ptr->iface->backend_apply(smpl.ptr, ctx, gf, data);
+        }
+    }
+}
+
+static void llama_sampler_chain_backend_set_input(struct llama_sampler * smpl) {
+    auto * chain = (llama_sampler_chain *) smpl->ctx;
+
+    for (auto & smpl : chain->samplers) {
+        if (!smpl.is_backend) {
+            break;
+        }
+
+        if (smpl.ptr->iface->backend_set_input) {
+            smpl.ptr->iface->backend_set_input(smpl.ptr);
+        }
+    }
+}
+
 static struct llama_sampler_i llama_sampler_chain_i = {
-    /* .name   = */ llama_sampler_chain_name,
-    /* .accept = */ llama_sampler_chain_accept,
-    /* .apply  = */ llama_sampler_chain_apply,
-    /* .reset  = */ llama_sampler_chain_reset,
-    /* .clone  = */ llama_sampler_chain_clone,
-    /* .free   = */ llama_sampler_chain_free,
+    /* .name              = */ llama_sampler_chain_name,
+    /* .accept            = */ llama_sampler_chain_accept,
+    /* .apply             = */ llama_sampler_chain_apply,
+    /* .reset             = */ llama_sampler_chain_reset,
+    /* .clone             = */ llama_sampler_chain_clone,
+    /* .free              = */ llama_sampler_chain_free,
+    /* .backend_init      = */ llama_sampler_chain_backend_init,
+    /* .backend_accept    = */ llama_sampler_chain_backend_accept,
+    /* .backend_apply     = */ llama_sampler_chain_backend_apply,
+    /* .backend_set_input = */ llama_sampler_chain_backend_set_input,
 };
 
 struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) {
```
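Taken together with `llama_sampler_chain_apply` above, the `break` on the first `!smpl.is_backend` entry means the chain is split into a backend prefix and a CPU suffix. A plain-C++ sketch of that split (simplified flags, not the real chain types):

```cpp
// Sketch, under the assumption of a simplified model: samplers run on the
// backend only up to the first one whose backend_init failed; everything from
// that point on runs on the CPU via the classic apply path.
#include <cstdio>
#include <vector>

int main() {
    // is_backend flags as they would be set by llama_sampler_chain_backend_init
    std::vector<bool> is_backend = { true, true, false, true };

    size_t n_device = 0;
    while (n_device < is_backend.size() && is_backend[n_device]) {
        n_device++;
    }

    std::printf("samplers [0, %zu) run on the backend\n", n_device);
    std::printf("samplers [%zu, %zu) run on the CPU\n", n_device, is_backend.size());
    // note: the 4th sampler runs on the CPU even though its ops are supported;
    // once the chain falls back, it stays on the CPU for the rest of the chain
}
```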
```diff
@@ -526,26 +794,113 @@ struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_param
         /* .iface = */ &llama_sampler_chain_i,
         /* .ctx   = */ new llama_sampler_chain {
             /* .params      = */ params,
+            /* .is_init     = */ false,
             /* .samplers    = */ {},
+            /* .cur         = */ {},
             /* .t_sample_us = */ 0,
             /* .n_sample    = */ 0,
         }
     );
 }
 
+llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx) {
+    const llama_token   sampled_token  = llama_get_sampled_token_ith     (ctx, idx);
+    const float       * sampled_probs  = llama_get_sampled_probs_ith     (ctx, idx);
+    const float       * sampled_logits = llama_get_sampled_logits_ith    (ctx, idx);
+    const llama_token * sampled_ids    = llama_get_sampled_candidates_ith(ctx, idx);
+
+    // If a backend sampler has already sampled a token, return it.
+    if (sampled_token != LLAMA_TOKEN_NULL) {
+        LLAMA_LOG_DEBUG("%s: Backend sampler selected token for idx %d. Skipping CPU samplers\n", __func__, idx);
+        return sampled_token;
+    }
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);
+
+    // use pre-allocated buffer from chain if available, otherwise allocate locally
+    std::vector<llama_token_data> * cur_ptr;
+    std::vector<llama_token_data> cur_local;
+
+    if (smpl->iface == &llama_sampler_chain_i) {
+        auto * chain = (llama_sampler_chain *) smpl->ctx;
+        cur_ptr = &chain->cur;
+    } else {
+        cur_ptr = &cur_local;
+    }
+
+    auto & cur = *cur_ptr;
+
+    if (sampled_probs) {
+        const uint32_t sampled_probs_count = llama_get_sampled_probs_count_ith(ctx, idx);
+        cur.resize(sampled_probs_count);
+        for (uint32_t i = 0; i < sampled_probs_count; ++i) {
+            cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], sampled_probs[i]};
+        }
+    } else if (sampled_logits) {
+        const uint32_t sampled_logits_count = llama_get_sampled_logits_count_ith(ctx, idx);
+        cur.resize(sampled_logits_count);
+        for (llama_token i = 0; i < (int)sampled_logits_count; i++) {
+            cur[i] = llama_token_data{sampled_ids[i], sampled_logits[i], 0.0f};
+        }
+    } else {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+        GGML_ASSERT(logits != nullptr);
+        cur.resize(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        }
+    }
+
+    llama_token_data_array cur_p = {
+        /* .data     = */ cur.data(),
+        /* .size     = */ cur.size(),
+        /* .selected = */ -1,
+        /* .sorted   = */ false,
+    };
+
+    llama_sampler_apply(smpl, &cur_p);
+
+    GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size);
+
+    auto token = cur_p.data[cur_p.selected].id;
+
+    llama_sampler_accept(smpl, token);
+
+    return token;
+}
+
+
 void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler * smpl) {
     auto * p = (llama_sampler_chain *) chain->ctx;
-    p->samplers.push_back(smpl);
+    p->samplers.push_back({
+        /* .is_backend = */ false,
+        /* .ptr        = */ smpl,
+    });
 }
 
-struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
+struct llama_sampler * llama_sampler_chain_get(struct llama_sampler * chain, int32_t i) {
+    if (chain == nullptr) {
+        return nullptr;
+    }
+
+    if (chain->iface != &llama_sampler_chain_i) {
+        return nullptr;
+    }
+
+    if (i == -1) {
+        return chain;
+    }
+
     const auto * p = (const llama_sampler_chain *) chain->ctx;
 
     if (i < 0 || (size_t) i >= p->samplers.size()) {
         return nullptr;
     }
 
-    return p->samplers[i];
+    return p->samplers[i].ptr;
 }
 
 struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) {
```
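The new `llama_sampler_sample` picks its candidate source in a fixed order: a token already sampled on the backend wins outright, otherwise backend probabilities, then backend-filtered logits, then the full-vocabulary logits. A sketch of that precedence with simplified stand-in types (the `backend_out` struct is hypothetical, mirroring the `llama_get_sampled_*_ith` getters):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

struct token_data { int32_t id; float logit; float p; };

// Hypothetical stand-in for the per-position backend outputs exposed by the
// llama_get_sampled_*_ith getters (a sampled token short-circuits before this).
struct backend_out {
    std::vector<int32_t> ids;     // candidate vocab ids
    std::vector<float>   logits;  // filtered logits, parallel to ids
    std::vector<float>   probs;   // probabilities, parallel to ids
};

std::vector<token_data> build_candidates(const backend_out & out,
                                         const std::vector<float> & full_logits) {
    std::vector<token_data> cur;
    if (!out.probs.empty()) {
        // backend produced probabilities over a (possibly truncated) candidate set
        for (size_t i = 0; i < out.probs.size(); ++i) {
            cur.push_back({out.ids[i], out.logits[i], out.probs[i]});
        }
    } else if (!out.logits.empty()) {
        // backend produced filtered logits only
        for (size_t i = 0; i < out.logits.size(); ++i) {
            cur.push_back({out.ids[i], out.logits[i], 0.0f});
        }
    } else {
        // no backend sampler ran: fall back to the full vocabulary
        for (int32_t id = 0; id < (int32_t) full_logits.size(); ++id) {
            cur.push_back({id, full_logits[id], 0.0f});
        }
    }
    return cur; // the CPU sampler chain is then applied to this array
}
```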
```diff
@@ -555,7 +910,7 @@ struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain,
         return nullptr;
     }
 
-    auto * result = p->samplers[i];
+    auto * result = p->samplers[i].ptr;
     p->samplers.erase(p->samplers.begin() + i);
 
     return result;
```
```diff
@@ -573,8 +928,36 @@ int llama_sampler_chain_n(const struct llama_sampler * chain) {
 
 // greedy
 
-static const char * llama_sampler_greedy_name(const struct llama_sampler * /*smpl*/) {
-    return "greedy";
+struct llama_sampler_greedy : public llama_sampler_backend {
+};
+
+static const char * llama_sampler_greedy_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_greedy *) smpl->ctx;
+    return sctx->get_name();
+}
+
+static void llama_sampler_greedy_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_greedy *) smpl->ctx;
+    GGML_UNUSED(ctx);
+}
+
+static struct llama_sampler * llama_sampler_greedy_clone(const struct llama_sampler * smpl) {
+    const auto * ctx = (const llama_sampler_greedy *) smpl->ctx;
+    auto * result = llama_sampler_init_greedy();
+
+    // copy the state
+    {
+        auto * result_ctx = (llama_sampler_greedy *) result->ctx;
+
+        GGML_UNUSED(ctx);
+        GGML_UNUSED(result_ctx);
+    }
+
+    return result;
+}
+
+static void llama_sampler_greedy_free(struct llama_sampler * smpl) {
+    delete (llama_sampler_greedy *) smpl->ctx;
 }
 
 static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_token_data_array * cur_p) {
```
```diff
@@ -586,33 +969,72 @@ static void llama_sampler_greedy_apply(struct llama_sampler * /*smpl*/, llama_to
     }
 }
 
+static bool llama_sampler_greedy_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_greedy *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_greedy_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    GGML_UNUSED(gf);
+    GGML_UNUSED(smpl);
+
+    struct ggml_tensor * curl = ggml_argmax(ctx, data->logits);
+    ggml_set_name(curl, "greedy_argmax");
+
+    data->sampled = curl;
+}
+
 static struct llama_sampler_i llama_sampler_greedy_i = {
-    /* .name   = */ llama_sampler_greedy_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_greedy_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ nullptr,
-    /* .free   = */ nullptr,
+    /* .name              = */ llama_sampler_greedy_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_greedy_apply,
+    /* .reset             = */ llama_sampler_greedy_reset,
+    /* .clone             = */ llama_sampler_greedy_clone,
+    /* .free              = */ llama_sampler_greedy_free,
+    /* .backend_init      = */ llama_sampler_greedy_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_greedy_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_greedy() {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_greedy_i,
-        /* .ctx   = */ nullptr,
+        /* .ctx   = */ new llama_sampler_greedy {
+            ("greedy"),
+        }
     );
 }
 
 // dist
 
-struct llama_sampler_dist {
+struct llama_sampler_dist : public llama_sampler_backend {
     const uint32_t seed;
           uint32_t seed_cur;
 
     std::mt19937 rng;
+
+    // backend input
+    struct ggml_tensor * inp_uniform;
+
+    ggml_context_ptr        inp_ctx;
+    ggml_backend_buffer_ptr inp_buf;
 };
 
-static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) {
-    return "dist";
+static const char * llama_sampler_dist_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_dist *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
```
```diff
@@ -687,6 +1109,12 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
 #endif
 }
 
+static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_dist *) smpl->ctx;
+    ctx->seed_cur = get_rng_seed(ctx->seed);
+    ctx->rng.seed(ctx->seed_cur);
+}
+
 static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_dist *) smpl->ctx;
     auto * result = llama_sampler_init_dist(ctx->seed);
```
```diff
@@ -701,23 +1129,127 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample
     return result;
 }
 
-static void llama_sampler_dist_reset(struct llama_sampler * smpl) {
-    auto * ctx = (llama_sampler_dist *) smpl->ctx;
-    ctx->seed_cur = get_rng_seed(ctx->seed);
-    ctx->rng.seed(ctx->seed_cur);
-}
-
 static void llama_sampler_dist_free(struct llama_sampler * smpl) {
     delete (llama_sampler_dist *) smpl->ctx;
 }
 
+static bool llama_sampler_dist_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_dist *) smpl->ctx;
+
+    // allocate inputs
+    {
+        ggml_init_params params = {
+            /*.mem_size   =*/ ggml_tensor_overhead(),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+
+        sctx->inp_ctx.reset(ggml_init(params));
+
+        // Create the uniform random scalar input tensor. This will be set by
+        // llama_sampler_dist_backend_set_input after this graph is built.
+        sctx->inp_uniform = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1);
+        ggml_set_name (sctx->inp_uniform, "uniform");
+        ggml_set_input(sctx->inp_uniform);
+
+        // Allocate all tensors from our context to the backend
+        sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
+
+        ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
+    }
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    if (!res) {
+        sctx->inp_ctx.reset(nullptr);
+        sctx->inp_buf.reset(nullptr);
+    }
+
+    return res;
+}
+
+static void llama_sampler_dist_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    GGML_UNUSED(gf);
+    auto * sctx = (llama_sampler_dist *) smpl->ctx;
+
+    struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
+    ggml_set_name(probs, "dist_probs");
+
+    struct ggml_tensor * cumsum = ggml_cumsum(ctx, probs);
+    ggml_set_name(cumsum, "dist_cumsum");
+
+    // The uniform tensor holds a random value and we subtract it from the
+    // cumsum tensor (the uniform tensor will be broadcast by ggml_sub).
+    // Recall that each entry in cumsum is the cumulative probability up to that
+    // index so values stay negative while the cumulative total is below the
+    // random value, and become zero/positive once the threshold is crossed.
+    struct ggml_tensor * diff = ggml_sub(ctx, cumsum, sctx->inp_uniform);
+    ggml_set_name(diff, "dist_cumsum");
+
+    // The ggml_step function produces a tensor where entries are 1 if the
+    // corresponding entry in diff is > 0, and 0 otherwise. So all values up to
+    // the index where the cumulative probability exceeds the random value are 0,
+    // and all entries after that are 1.
+    struct ggml_tensor * mask = ggml_step(ctx, diff);
+    ggml_set_name(mask, "dist_mask");
+
+    // Taking the sum of the mask gives us the sum of elements after the threshold
+    // we are interested in.
+    struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+    ggml_set_name(idxf, "dist_index_f32");
+
+    // Use ggml_scale_bias to scale the index value by -1 and then add the size
+    // of the mask to that value so we get the correct index ((-1 * idxf) + n).
+    struct ggml_tensor * idx = ggml_cast(ctx, ggml_scale_bias(ctx, idxf, -1.0f, mask->ne[0]), GGML_TYPE_I32);
+    ggml_set_name(idx, "dist_index_i32");
+
+    // Map back to original vocab ids if a candidates tensor is available.
+    struct ggml_tensor * sampled_token = idx;
+    if (data->candidates != nullptr) {
+        struct ggml_tensor * candidates = ggml_reshape_2d(ctx, data->candidates, 1, ggml_nelements(data->candidates));
+
+        sampled_token = ggml_get_rows(ctx, candidates, idx);
+        ggml_set_name(sampled_token, "dist_sampled_token");
+    }
+
+    data->sampled = sampled_token;
+    data->probs   = probs;
+}
+
+static void llama_sampler_dist_backend_set_input(struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_dist *) smpl->ctx;
+    GGML_ASSERT(sctx->inp_uniform != nullptr);
+
+    // We sample in double precision and cast to float to match rnd numbers of
+    // llama_sampler_dist which uses double precision (sampling from
+    // std::uniform_real_distribution<double> and
+    // std::uniform_real_distribution<float> with same rng will produce
+    // different sequences).
+    std::uniform_real_distribution<double> dist(0.0f, 1.0f);
+    const float rnd = dist(sctx->rng);
+
+    ggml_backend_tensor_set(sctx->inp_uniform, &rnd, 0, sizeof(float));
+}
+
 static struct llama_sampler_i llama_sampler_dist_i = {
-    /* .name   = */ llama_sampler_dist_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_dist_apply,
-    /* .reset  = */ llama_sampler_dist_reset,
-    /* .clone  = */ llama_sampler_dist_clone,
-    /* .free   = */ llama_sampler_dist_free,
+    /* .name              = */ llama_sampler_dist_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_dist_apply,
+    /* .reset             = */ llama_sampler_dist_reset,
+    /* .clone             = */ llama_sampler_dist_clone,
+    /* .free              = */ llama_sampler_dist_free,
+    /* .backend_init      = */ llama_sampler_dist_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_dist_backend_apply,
+    /* .backend_set_input = */ llama_sampler_dist_backend_set_input,
 };
 
 struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
```
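The comments in `llama_sampler_dist_backend_apply` describe a branch-free inverse-CDF lookup. A CPU reference of the same arithmetic, useful for checking the graph (plain C++, ggml ops replaced by scalar loops):

```cpp
// With u ~ U(0,1), step(cumsum(p) - u) is 0 before the threshold index and 1
// from it onwards, so n - sum(step(...)) recovers the sampled index without a
// data-dependent branch. Minimal sketch of that identity.
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const std::vector<float> probs = {0.1f, 0.2f, 0.4f, 0.3f}; // softmax output
    std::mt19937 rng(42);
    std::uniform_real_distribution<double> dist(0.0, 1.0);
    const float u = (float) dist(rng);

    float cumsum   = 0.0f;
    float mask_sum = 0.0f; // sum of step(cumsum - u)
    for (float p : probs) {
        cumsum   += p;
        mask_sum += (cumsum - u > 0.0f) ? 1.0f : 0.0f; // ggml_step
    }

    // ggml_scale_bias: idx = (-1 * mask_sum) + n
    const int idx = (int) (-1.0f * mask_sum + (float) probs.size());
    std::printf("u = %.3f -> sampled index %d\n", u, idx);
}
```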
```diff
@@ -725,21 +1257,26 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_dist_i,
         /* .ctx   = */ new llama_sampler_dist {
-            /* .seed     = */ seed,
-            /* .seed_cur = */ seed_cur,
-            /* .rng      = */ std::mt19937(seed_cur),
+            ("dist"),
+            /* .seed        = */ seed,
+            /* .seed_cur    = */ seed_cur,
+            /* .rng         = */ std::mt19937(seed_cur),
+            /* .inp_uniform = */ nullptr,
+            /* .inp_ctx     = */ nullptr,
+            /* .inp_buf     = */ nullptr,
         }
     );
 }
 
 // top-k
 
-struct llama_sampler_top_k {
+struct llama_sampler_top_k : public llama_sampler_backend {
     const int32_t k;
 };
 
-static const char * llama_sampler_top_k_name(const struct llama_sampler * /*smpl*/) {
-    return "top-k";
+static const char * llama_sampler_top_k_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_top_k_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
```
```diff
@@ -756,19 +1293,69 @@ static void llama_sampler_top_k_free(struct llama_sampler * smpl) {
     delete (llama_sampler_top_k *) smpl->ctx;
 }
 
+static bool llama_sampler_top_k_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_top_k_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * sctx = (llama_sampler_top_k *) smpl->ctx;
+
+    struct ggml_tensor * top_k = ggml_top_k(ctx, data->logits, sctx->k);
+    ggml_set_name(top_k, "top_k");
+
+    if (data->candidates) {
+        struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
+        data->candidates = ggml_get_rows(ctx, candidates_rows, top_k);
+        data->candidates = ggml_reshape_1d(ctx, data->candidates, sctx->k);
+        ggml_set_name(data->candidates, "top_k_candidates");
+    } else {
+        data->candidates = top_k;
+    }
+
+    struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+    struct ggml_tensor * top_k_rows  = ggml_get_rows(ctx, logits_rows, top_k);
+    data->logits = ggml_reshape_1d(ctx, top_k_rows, sctx->k);
+    ggml_set_name(top_k_rows, "top_k_rows");
+
+    GGML_UNUSED(gf);
+}
+
 static struct llama_sampler_i llama_sampler_top_k_i = {
-    /* .name   = */ llama_sampler_top_k_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_top_k_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_top_k_clone,
-    /* .free   = */ llama_sampler_top_k_free,
+    /* .name              = */ llama_sampler_top_k_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_top_k_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_top_k_clone,
+    /* .free              = */ llama_sampler_top_k_free,
+    /* .backend_init      = */ llama_sampler_top_k_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_top_k_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
+    const bool is_empty = (k <= 0);
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?top-k");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_k_i,
         /* .ctx   = */ new llama_sampler_top_k {
+            ("top-k"),
             /* .k = */ k,
         }
     );
```
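ggml has no 1-D gather, which is why the top-k kernel reshapes tensors to one element per row and gathers with `ggml_get_rows`. The selection itself is equivalent to this scalar sketch (`std::partial_sort` standing in for `ggml_top_k`):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> logits = {1.0f, 5.0f, 3.0f, 4.0f, 2.0f};
    const int k = 3;

    // indices of the k largest logits (what ggml_top_k produces)
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return logits[a] > logits[b]; });

    // the "get_rows" step: gather the k selected logits by index
    for (int i = 0; i < k; ++i) {
        std::printf("candidate %d -> logit %.1f\n", idx[i], logits[idx[i]]);
    }
}
```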
```diff
@@ -776,15 +1363,16 @@ struct llama_sampler * llama_sampler_init_top_k(int32_t k) {
 
 // top-p
 
-struct llama_sampler_top_p {
+struct llama_sampler_top_p : public llama_sampler_backend {
     const float  p;
     const size_t min_keep;
 
     std::vector<llama_token_data> buf_sort;
 };
 
-static const char * llama_sampler_top_p_name(const struct llama_sampler * /*smpl*/) {
-    return "top-p";
+static const char * llama_sampler_top_p_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_top_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
```
```diff
@@ -851,19 +1439,118 @@ static void llama_sampler_top_p_free(struct llama_sampler * smpl) {
     delete (llama_sampler_top_p *) smpl->ctx;
 }
 
+static bool llama_sampler_top_p_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_top_p_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * sctx = (llama_sampler_top_p *) smpl->ctx;
+
+    auto ggml_sort = [ctx](struct ggml_tensor * a, struct ggml_tensor * b) {
+        GGML_ASSERT(ggml_nrows(a) == 1);
+        struct ggml_tensor * a_reshaped = ggml_reshape_2d(ctx, a, 1, a->ne[0]);
+        struct ggml_tensor * a_sorted   = ggml_get_rows(ctx, a_reshaped, b);
+        return ggml_reshape_1d(ctx, a_sorted, a->ne[0]);
+    };
+
+    // Get the sorted logits in descending order.
+    struct ggml_tensor * sorted_idx = ggml_argsort(ctx, data->logits, GGML_SORT_ORDER_DESC);
+    ggml_set_name(sorted_idx, "top_p_sorted_idx");
+
+    // Do the sorting via reshape + get_rows
+    struct ggml_tensor * sorted_logits = ggml_sort(data->logits, sorted_idx);
+    ggml_set_name(sorted_logits, "top_p_sorted_logits");
+
+    struct ggml_tensor * softmax = ggml_soft_max(ctx, sorted_logits);
+    ggml_set_name(softmax, "top_p_softmax");
+
+    // If candidates are provided, sort them as well. Otherwise, set sorted indices as candidates.
+    if (data->candidates) {
+        data->candidates = ggml_sort(data->candidates, sorted_idx);
+    } else {
+        data->candidates = sorted_idx;
+    }
+    ggml_set_name(data->candidates, "top_p_candidates");
+
+    // Compute Cumulative Distribution Function (CDF) by means of GGML_OP_CUMSUM.
+    struct ggml_tensor * cdf = ggml_cumsum(ctx, softmax);
+    ggml_set_name(cdf, "top_p_cdf");
+
+    // Invert CDF and add top-p value so that ggml_step yields 1 for values we want to keep
+    struct ggml_tensor * cdf_scaled = ggml_scale_bias(ctx, cdf, -1.0f, sctx->p);
+    ggml_set_name(cdf_scaled, "top_p_cdf_scaled");
+
+    struct ggml_tensor * mask = ggml_step(ctx, cdf_scaled);
+    ggml_set_name(mask, "top_p_mask");
+
+    // Taking the sum of the mask gives us the sum of elements after the threshold
+    // we are interested in.
+    struct ggml_tensor * idxf = ggml_sum(ctx, mask);
+    ggml_set_name(idxf, "top_p_index_f32");
+
+    // prevent out-of-bounds access
+    idxf = ggml_clamp(ctx, idxf, 0.0f, mask->ne[0] - 1);
+
+    // construct ones tensor to set the value in the mask
+    struct ggml_tensor * ones = ggml_scale_bias(ctx, idxf, 0.0f, 1.0f);
+    ggml_set_name(ones, "top_p_ones");
+
+    // Make top-p inclusive (i.e. return all values such that cum_sum/cdf >= p)
+    struct ggml_tensor * mask_reshaped = ggml_reshape_2d(ctx, mask, 1, mask->ne[0]);
+
+    mask_reshaped = ggml_set_rows(ctx, mask_reshaped, ones, ggml_cast(ctx, idxf, GGML_TYPE_I32));
+    mask = ggml_reshape_1d(ctx, mask_reshaped, mask->ne[0]);
+
+    // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
+    // top_p_bias = (mask * 1e9f) - 1e9f.
+    // So entries in the mask that we want to discard will become -1e9f, and
+    // others will be 0 (meaning that they will not affect the logits).
+    const float large_val = 1e9f;
+    struct ggml_tensor * top_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
+    ggml_set_name(top_p_bias, "top_p_bias");
+
+    data->logits = ggml_add(ctx, sorted_logits, top_p_bias);
+    ggml_set_name(data->logits, "top_p_logits");
+
+    GGML_UNUSED(gf);
+}
+
 static struct llama_sampler_i llama_sampler_top_p_i = {
-    /* .name   = */ llama_sampler_top_p_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_top_p_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_top_p_clone,
-    /* .free   = */ llama_sampler_top_p_free,
+    /* .name              = */ llama_sampler_top_p_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_top_p_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_top_p_clone,
+    /* .free              = */ llama_sampler_top_p_free,
+    /* .backend_init      = */ llama_sampler_top_p_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_top_p_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
+    const bool is_empty = p >= 1.0f;
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?top-p");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_p_i,
         /* .ctx   = */ new llama_sampler_top_p {
+            ("top-p"),
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
             /* .buf_sort = */ {},
```
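The `set_rows` fix-up above is what makes top-p inclusive: `step(p - cdf)` alone would drop the token that crosses the threshold. A worked scalar example (the real graph clamps the index first, as above):

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const float p = 0.8f;
    const std::vector<float> sorted_probs = {0.5f, 0.25f, 0.15f, 0.1f}; // descending

    std::vector<float> mask(sorted_probs.size());
    float cdf  = 0.0f;
    float keep = 0.0f; // sum of the mask == index of the token that crosses p
    for (size_t i = 0; i < sorted_probs.size(); ++i) {
        cdf += sorted_probs[i];
        mask[i] = (-cdf + p > 0.0f) ? 1.0f : 0.0f; // ggml_scale_bias + ggml_step
        keep += mask[i];
    }

    // the ggml_set_rows fix-up: force the crossing token back into the kept set
    // (the real graph clamps keep to n-1 first to stay in bounds)
    mask[(size_t) keep] = 1.0f;

    for (size_t i = 0; i < mask.size(); ++i) {
        std::printf("token %zu: mask = %.0f\n", i, mask[i]); // prints 1 1 1 0
    }
}
```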
```diff
@@ -873,13 +1560,14 @@ struct llama_sampler * llama_sampler_init_top_p(float p, size_t min_keep) {
 
 // min-p
 
-struct llama_sampler_min_p {
+struct llama_sampler_min_p : public llama_sampler_backend {
     const float  p;
     const size_t min_keep;
 };
 
-static const char * llama_sampler_min_p_name(const struct llama_sampler * /*smpl*/) {
-    return "min-p";
+static const char * llama_sampler_min_p_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
```
```diff
@@ -945,19 +1633,85 @@ static void llama_sampler_min_p_free(struct llama_sampler * smpl) {
     delete (llama_sampler_min_p *) smpl->ctx;
 }
 
+static bool llama_sampler_min_p_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_min_p_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * sctx = (llama_sampler_min_p *) smpl->ctx;
+
+    struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
+    ggml_set_name(max_idx, "max_idx");
+
+    struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+    ggml_set_name(logits_rows, "logits_rows");
+
+    struct ggml_tensor * max_logit = ggml_get_rows(ctx, logits_rows, max_idx);
+    ggml_set_name(max_logit, "max_logit");
+
+    // Calculate the threshold value.
+    struct ggml_tensor * threshold = ggml_scale_bias(ctx, max_logit, 1.0f, logf(sctx->p));
+    ggml_set_name(threshold, "min_p_threshold");
+
+    // Subtract the threshold from logits.
+    struct ggml_tensor * sub = ggml_sub(ctx, data->logits, threshold);
+
+    // Create a mask where logits below the threshold are 0 (discard),
+    // and others are 1 (keep).
+    struct ggml_tensor * mask = ggml_step(ctx, sub);
+    ggml_set_name(mask, "min_p_mask");
+
+    // Use ggml_scale_bias (output = (a * s) + b) which in this case becomes:
+    // min_p_bias = (mask * 1e9f) - 1e9f.
+    // So entries in the mask that we want to discard will become -1e9f, and
+    // others will be 0 (meaning that they will not affect the logits).
+    const float large_val = 1e9f;
+    struct ggml_tensor * min_p_bias = ggml_scale_bias(ctx, mask, large_val, -large_val);
+    ggml_set_name(min_p_bias, "min_p_bias");
+
+    // Add the min_p bias to the logits.
+    data->logits = ggml_add(ctx, data->logits, min_p_bias);
+    ggml_set_name(data->logits, "min_p_logits");
+
+    GGML_UNUSED(gf);
+}
+
 static struct llama_sampler_i llama_sampler_min_p_i = {
-    /* .name   = */ llama_sampler_min_p_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_min_p_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_min_p_clone,
-    /* .free   = */ llama_sampler_min_p_free,
+    /* .name              = */ llama_sampler_min_p_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_min_p_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_min_p_clone,
+    /* .free              = */ llama_sampler_min_p_free,
+    /* .backend_init      = */ llama_sampler_min_p_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_min_p_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
+    const bool is_empty = (p <= 0.0f);
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?min-p");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_min_p_i,
         /* .ctx   = */ new llama_sampler_min_p {
+            ("min-p"),
             /* .p        = */ p,
             /* .min_keep = */ min_keep,
         }
```
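The min-p kernel works entirely in the log domain: keeping tokens with `p_i >= p * p_max` is the same as keeping `logit_i >= logit_max + log(p)`, because the softmax normalizer cancels out of the ratio. A small numeric check:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const float p = 0.05f;
    const std::vector<float> logits = {2.0f, 1.0f, -1.5f, -4.0f};

    const float logit_max = *std::max_element(logits.begin(), logits.end());
    const float threshold = logit_max + std::log(p); // ggml_scale_bias(max_logit, 1.0, log(p))

    for (size_t i = 0; i < logits.size(); ++i) {
        const bool keep = logits[i] - threshold > 0.0f; // ggml_sub + ggml_step
        std::printf("logit %+.1f -> %s\n", logits[i], keep ? "keep" : "drop (-1e9 bias)");
    }
}
```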
```diff
@@ -1045,15 +1799,25 @@ static void llama_sampler_typical_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_typical_i = {
-    /* .name   = */ llama_sampler_typical_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_typical_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_typical_clone,
-    /* .free   = */ llama_sampler_typical_free,
+    /* .name              = */ llama_sampler_typical_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_typical_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_typical_clone,
+    /* .free              = */ llama_sampler_typical_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
+    const bool is_empty = (p >= 1.0f);
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?typical");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_typical_i,
         /* .ctx   = */ new llama_sampler_typical {
```
```diff
@@ -1065,12 +1829,13 @@ struct llama_sampler * llama_sampler_init_typical(float p, size_t min_keep) {
 
 // temp
 
-struct llama_sampler_temp {
+struct llama_sampler_temp : public llama_sampler_backend {
     const float temp;
 };
 
-static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*/) {
-    return "temp";
+static const char * llama_sampler_temp_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_temp *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
```
@@ -1088,19 +1853,79 @@ static void llama_sampler_temp_free(struct llama_sampler * smpl) {
     delete (llama_sampler_temp *) smpl->ctx;
 }
 
+static void llama_sampler_backend_temp_sampling(
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data,
+        float temp) {
+    if (temp <= 0.0f) {
+        // Find the most probable token index.
+        struct ggml_tensor * max_idx = ggml_argmax(ctx, data->logits);
+        ggml_set_name(max_idx, "temp_max_idx");
+
+        if (data->candidates) {
+            struct ggml_tensor * candidates_rows = ggml_reshape_2d(ctx, data->candidates, 1, data->candidates->ne[0]);
+            data->candidates = ggml_get_rows(ctx, candidates_rows, max_idx);
+        } else {
+            data->candidates = max_idx;
+        }
+
+        struct ggml_tensor * logits_rows = ggml_reshape_2d(ctx, data->logits, 1, data->logits->ne[0]);
+        data->logits = ggml_get_rows(ctx, logits_rows, max_idx);
+
+        return;
+    }
+
+    data->logits = ggml_scale(ctx, data->logits, 1.0f / temp);
+
+    GGML_UNUSED(gf);
+}
+
+static bool llama_sampler_temp_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_temp *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_temp_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * sctx = (llama_sampler_temp *) smpl->ctx;
+    llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
+}
+
 static struct llama_sampler_i llama_sampler_temp_i = {
-    /* .name   = */ llama_sampler_temp_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_temp_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_temp_clone,
-    /* .free   = */ llama_sampler_temp_free,
+    /* .name              = */ llama_sampler_temp_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_temp_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_temp_clone,
+    /* .free              = */ llama_sampler_temp_free,
+    /* .backend_init      = */ llama_sampler_temp_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_temp_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_temp(float temp) {
+    const bool is_empty = temp == 1.0f;
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?temp");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_i,
         /* .ctx   = */ new llama_sampler_temp {
+            ("temp"),
             /*.temp = */ temp,
         }
     );
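The new `llama_sampler_backend_temp_sampling` helper mirrors the CPU temperature sampler inside a ggml graph: a non-positive `temp` collapses the distribution to the argmax token, anything else just rescales the logits. A scalar sketch of the same math (plain C++ for illustration, not part of the package):

```cpp
#include <algorithm>
#include <vector>

// Scalar equivalent of llama_sampler_backend_temp_sampling: temp <= 0 keeps
// only the most probable entry, otherwise every logit is scaled by 1/temp.
static void temp_sampling_ref(std::vector<float> & logits, float temp) {
    if (temp <= 0.0f) {
        const float best = *std::max_element(logits.begin(), logits.end());
        logits.assign(1, best); // collapse to the argmax entry, as ggml_argmax + ggml_get_rows does
        return;
    }
    for (float & l : logits) {
        l /= temp; // same as ggml_scale(ctx, logits, 1.0f / temp)
    }
}
```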
@@ -1108,14 +1933,15 @@ struct llama_sampler * llama_sampler_init_temp(float temp) {
 
 // temp-ext
 
-struct llama_sampler_temp_ext {
+struct llama_sampler_temp_ext : public llama_sampler_backend {
     const float temp;
     const float delta;
     const float exponent;
 };
 
-static const char * llama_sampler_temp_ext_name(const struct llama_sampler * /*smpl*/) {
-    return "temp-ext";
+static const char * llama_sampler_temp_ext_name(const struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+    return sctx->get_name();
 }
 
 static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
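`llama_sampler_temp_ext` now derives from `llama_sampler_backend`, whose definition lives outside this hunk. Judging only from how it is used here (a constructor taking a name literal, `init(bool)` called from `backend_init`, `get_name()` backing the `.name` callback), a minimal stand-in could look like the sketch below; the real base in llama-sampling.cpp may carry more state, so treat this as an assumption:

```cpp
#include <string>

// Hypothetical stand-in for the llama_sampler_backend base used in this diff.
// It only needs to carry a display name and remember whether the backend
// (graph) path was enabled for this sampler instance.
struct llama_sampler_backend_sketch {
    llama_sampler_backend_sketch(const char * name) : m_name(name) {}

    void init(bool backend_supported) { m_backend = backend_supported; }

    const char * get_name() const { return m_name.c_str(); }

    bool backend_enabled() const { return m_backend; }

private:
    std::string m_name;
    bool        m_backend = false;
};
```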
@@ -1198,24 +2024,112 @@ static void llama_sampler_temp_ext_free(struct llama_sampler * smpl) {
     delete (llama_sampler_temp_ext *) smpl->ctx;
 }
 
+static bool llama_sampler_temp_ext_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+
+    const bool res = llama_sampler_backend_support(smpl, buft);
+
+    sctx->init(res);
+
+    return res;
+}
+
+static void llama_sampler_temp_ext_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    auto * sctx = (llama_sampler_temp_ext *) smpl->ctx;
+
+    // Revert to standard temperature scaling if delta or temp are non-positive.
+    if (sctx->delta <= 0.0f || sctx->temp <= 0.0f) {
+        llama_sampler_backend_temp_sampling(ctx, gf, data, sctx->temp);
+        return;
+    }
+
+    // Calculate min_temp, max_temp, and max_entropy.
+    const float min_temp = std::max(0.0f, sctx->temp - sctx->delta);
+    const float max_temp = sctx->temp + sctx->delta;
+    const float max_entropy = logf(data->logits->ne[0]);
+
+    // Calculate the probabilities.
+    struct ggml_tensor * probs = ggml_soft_max(ctx, data->logits);
+    ggml_set_name(probs, "temp_ext_softmax_probs");
+
+    // Clamp probabilities to avoid log(0) which would give -inf
+    struct ggml_tensor * probs_clamped = ggml_clamp(ctx, probs, 1e-10f, 1.0f);
+    ggml_set_name(probs_clamped, "temp_ext_probs_clamped");
+
+    // Calculate the entropy, entropy = -Σ(p * log(p)).
+    struct ggml_tensor * log_probs = ggml_log(ctx, probs_clamped);
+    struct ggml_tensor * p_log_p = ggml_mul(ctx, probs_clamped, log_probs);
+    struct ggml_tensor * sum_p_log_p = ggml_sum(ctx, p_log_p);
+    struct ggml_tensor * entropy = ggml_scale(ctx, sum_p_log_p, -1.0f);
+    ggml_set_name(log_probs, "temp_ext_log_probs");
+    ggml_set_name(p_log_p, "temp_ext_p_log_p");
+    ggml_set_name(sum_p_log_p, "temp_ext_sum_p_log_p");
+    ggml_set_name(entropy, "temp_ext_entropy");
+
+    // Normalize the entropy, norm_entropy = entropy / max_entropy
+    struct ggml_tensor * norm_entropy = ggml_scale(ctx, entropy, 1.0f / max_entropy);
+    ggml_set_name(norm_entropy, "temp_ext_norm_entropy");
+
+    // Calculate the dynamic temperature:
+    // dyn_temp = min_temp + (max_temp - min_temp) * powf(normalized_entropy, exponent);
+    //
+    // Calculate powf(normalized_entropy, exponent) as
+    // norm_entropy^exponent = exp(exponent * log(norm_entropy))
+    struct ggml_tensor * log_norm_entropy = ggml_log(ctx, norm_entropy);
+    struct ggml_tensor * scaled_log = ggml_scale(ctx, log_norm_entropy, sctx->exponent);
+    struct ggml_tensor * pow_entropy = ggml_exp(ctx, scaled_log);
+    // With pow_entropy computed we can now compute dyn_temp, scaling by
+    // (max_temp - min_temp) and then adding min_temp.
+    struct ggml_tensor * dyn_temp = ggml_scale_bias(ctx, pow_entropy, max_temp - min_temp, min_temp);
+    ggml_set_name(log_norm_entropy, "temp_ext_log_norm_entropy");
+    ggml_set_name(scaled_log, "temp_ext_scaled_log");
+    ggml_set_name(pow_entropy, "temp_ext_pow_entropy");
+    ggml_set_name(dyn_temp, "temp_ext_dyn_temp");
+
+    // Scale the logits by the dynamic temperature
+    struct ggml_tensor * scaled_logits = ggml_div(ctx, data->logits, dyn_temp);
+    ggml_set_name(scaled_logits, "temp_ext_scaled_logits");
+
+    data->logits = scaled_logits;
+}
+
 static struct llama_sampler_i llama_sampler_temp_ext_i = {
-    /* .name   = */ llama_sampler_temp_ext_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_temp_ext_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_temp_ext_clone,
-    /* .free   = */ llama_sampler_temp_ext_free,
+    /* .name              = */ llama_sampler_temp_ext_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_temp_ext_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_temp_ext_clone,
+    /* .free              = */ llama_sampler_temp_ext_free,
+    /* .backend_init      = */ llama_sampler_temp_ext_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_temp_ext_backend_apply,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, float exponent) {
-    return llama_sampler_init(
+    const bool is_empty = temp == 1.0f && delta <= 0.0f;
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?temp-ext");
+    }
+
+    auto * res = llama_sampler_init(
         /* .iface = */ &llama_sampler_temp_ext_i,
         /* .ctx   = */ new llama_sampler_temp_ext {
+            ("temp-ext"),
             /* .temp     = */ temp,
             /* .delta    = */ delta,
             /* .exponent = */ exponent,
         }
     );
+
+    return res;
 }
 
 // xtc
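The graph above reproduces the existing entropy-based dynamic-temperature math with ggml ops, using `exp(exponent * log(x))` in place of `powf`. For reference, the same computation over a plain float vector (illustrative code, not from the package):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for the temp-ext graph: scale logits by a temperature
// interpolated between [min_temp, max_temp] according to normalized entropy.
static void temp_ext_ref(std::vector<float> & logits, float temp, float delta, float exponent) {
    const float min_temp    = std::max(0.0f, temp - delta);
    const float max_temp    = temp + delta;
    const float max_entropy = std::log((float) logits.size());

    // softmax (max-subtracted for numerical stability)
    const float max_l = *std::max_element(logits.begin(), logits.end());
    std::vector<float> probs(logits.size());
    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - max_l);
        sum += probs[i];
    }

    // entropy = -Σ p * log(p), with p clamped away from 0 like ggml_clamp above
    float entropy = 0.0f;
    for (float & p : probs) {
        p = std::max(p / sum, 1e-10f);
        entropy -= p * std::log(p);
    }

    // dyn_temp = min_temp + (max_temp - min_temp) * norm_entropy^exponent
    const float norm_entropy = entropy / max_entropy;
    const float dyn_temp     = min_temp + (max_temp - min_temp) * std::pow(norm_entropy, exponent);

    for (float & l : logits) {
        l /= dyn_temp;
    }
}
```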
@@ -1293,16 +2207,27 @@ static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_xtc_i = {
-    /* .name   = */ llama_sampler_xtc_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sample_xtc_apply,
-    /* .reset  = */ llama_sampler_xtc_reset,
-    /* .clone  = */ llama_sampler_xtc_clone,
-    /* .free   = */ llama_sampler_xtc_free,
+    /* .name              = */ llama_sampler_xtc_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sample_xtc_apply,
+    /* .reset             = */ llama_sampler_xtc_reset,
+    /* .clone             = */ llama_sampler_xtc_clone,
+    /* .free              = */ llama_sampler_xtc_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
-    auto seed_cur = get_rng_seed(seed);
+    const bool is_empty = (p <= 0.0f || t > 0.5f);
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?xtc");
+    }
+
+    const auto seed_cur = get_rng_seed(seed);
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_xtc_i,
         /* .ctx   = */ new llama_sampler_xtc {
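As with `?temp`, a configuration that cannot affect the distribution now short-circuits to a shared empty sampler whose reported name is the sampler name prefixed with `?`. For XTC that is `p <= 0.0f || t > 0.5f`: XTC only removes tokens while at least two exceed the probability threshold `t`, and no two tokens can each exceed 0.5. Usage sketch (illustrative values):

```cpp
#include "llama.h"

// Illustrative: with p = 0 XTC can never exclude a token, so as of this
// version llama_sampler_init_xtc returns the no-op "?xtc" sampler up front
// instead of allocating an XTC context and RNG state.
llama_sampler * noop_xtc = llama_sampler_init_xtc(
    /* p        = */ 0.0f,
    /* t        = */ 0.1f,
    /* min_keep = */ 1,
    /* seed     = */ LLAMA_DEFAULT_SEED);
```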
@@ -1401,16 +2326,21 @@ static void llama_sampler_mirostat_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_mirostat_i = {
-    /* .name   = */ llama_sampler_mirostat_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_mirostat_apply,
-    /* .reset  = */ llama_sampler_mirostat_reset,
-    /* .clone  = */ llama_sampler_mirostat_clone,
-    /* .free   = */ llama_sampler_mirostat_free,
+    /* .name              = */ llama_sampler_mirostat_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_mirostat_apply,
+    /* .reset             = */ llama_sampler_mirostat_reset,
+    /* .clone             = */ llama_sampler_mirostat_clone,
+    /* .free              = */ llama_sampler_mirostat_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) {
-    auto seed_cur = get_rng_seed(seed);
+    const auto seed_cur = get_rng_seed(seed);
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_mirostat_i,
         /* .ctx   = */ new llama_sampler_mirostat {
@@ -1500,12 +2430,16 @@ static void llama_sampler_mirostat_v2_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_mirostat_v2_i = {
-    /* .name   = */ llama_sampler_mirostat_v2_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_mirostat_v2_apply,
-    /* .reset  = */ llama_sampler_mirostat_v2_reset,
-    /* .clone  = */ llama_sampler_mirostat_v2_clone,
-    /* .free   = */ llama_sampler_mirostat_v2_free,
+    /* .name              = */ llama_sampler_mirostat_v2_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_mirostat_v2_apply,
+    /* .reset             = */ llama_sampler_mirostat_v2_reset,
+    /* .clone             = */ llama_sampler_mirostat_v2_clone,
+    /* .free              = */ llama_sampler_mirostat_v2_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) {
@@ -1617,12 +2551,16 @@ static void llama_sampler_grammar_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_grammar_i = {
-    /* .name   = */ llama_sampler_grammar_name,
-    /* .accept = */ llama_sampler_grammar_accept_impl,
-    /* .apply  = */ llama_sampler_grammar_apply,
-    /* .reset  = */ llama_sampler_grammar_reset,
-    /* .clone  = */ llama_sampler_grammar_clone,
-    /* .free   = */ llama_sampler_grammar_free,
+    /* .name              = */ llama_sampler_grammar_name,
+    /* .accept            = */ llama_sampler_grammar_accept_impl,
+    /* .apply             = */ llama_sampler_grammar_apply,
+    /* .reset             = */ llama_sampler_grammar_reset,
+    /* .clone             = */ llama_sampler_grammar_clone,
+    /* .free              = */ llama_sampler_grammar_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 static struct llama_sampler * llama_sampler_init_grammar_impl(
@@ -1824,12 +2762,16 @@ static void llama_sampler_penalties_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_penalties_i = {
-    /* .name   = */ llama_sampler_penalties_name,
-    /* .accept = */ llama_sampler_penalties_accept,
-    /* .apply  = */ llama_sampler_penalties_apply,
-    /* .reset  = */ llama_sampler_penalties_reset,
-    /* .clone  = */ llama_sampler_penalties_clone,
-    /* .free   = */ llama_sampler_penalties_free,
+    /* .name              = */ llama_sampler_penalties_name,
+    /* .accept            = */ llama_sampler_penalties_accept,
+    /* .apply             = */ llama_sampler_penalties_apply,
+    /* .reset             = */ llama_sampler_penalties_reset,
+    /* .clone             = */ llama_sampler_penalties_clone,
+    /* .free              = */ llama_sampler_penalties_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
@@ -1839,6 +2781,12 @@ struct llama_sampler * llama_sampler_init_penalties(
         float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
+    const bool is_empty = (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f));
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?penalties");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
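The same early-out applies to penalties: a zero window, or the neutral defaults (`repeat == 1.0`, `freq == 0.0`, `present == 0.0`), leave every logit untouched. The predicate extracted in isolation for clarity (it is inline in the package, not a separate function):

```cpp
#include <cstdint>

// The no-op test added to llama_sampler_init_penalties: nothing is penalized
// when the window is empty or all three penalties are at their neutral values.
static bool penalties_are_noop(int32_t last_n, float repeat, float freq, float present) {
    return last_n == 0 || (repeat == 1.0f && freq == 0.0f && present == 0.0f);
}
```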
@@ -1876,9 +2824,7 @@ static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_t
     for (size_t i = 0; i < cur_p->size; ++i) {
         // Only count non-negative infinity values
        if (cur_p->data[i].logit != -INFINITY) {
-            if (cur_p->data[i].logit > max) {
-                max = cur_p->data[i].logit;
-            }
+            max = std::max(max, cur_p->data[i].logit);
             logits_sum += cur_p->data[i].logit;
             valid_count++;
         }
@@ -1915,15 +2861,25 @@ static void llama_sampler_top_n_sigma_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_top_n_sigma_i = {
-    /* .name   = */ llama_sampler_top_n_sigma_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_top_n_sigma_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_top_n_sigma_clone,
-    /* .free   = */ llama_sampler_top_n_sigma_free,
+    /* .name              = */ llama_sampler_top_n_sigma_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_top_n_sigma_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_top_n_sigma_clone,
+    /* .free              = */ llama_sampler_top_n_sigma_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_top_n_sigma(float n) {
+    const bool is_empty = (n <= 0.0f);
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?top-n-sigma");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_top_n_sigma_i,
         /* .ctx   = */ new llama_sampler_top_n_sigma {
@@ -2245,12 +3201,16 @@ static void llama_sampler_dry_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_dry_i = {
-    /* .name   = */ llama_sampler_dry_name,
-    /* .accept = */ llama_sampler_dry_accept,
-    /* .apply  = */ llama_sampler_dry_apply,
-    /* .reset  = */ llama_sampler_dry_reset,
-    /* .clone  = */ llama_sampler_dry_clone,
-    /* .free   = */ llama_sampler_dry_free,
+    /* .name              = */ llama_sampler_dry_name,
+    /* .accept            = */ llama_sampler_dry_accept,
+    /* .apply             = */ llama_sampler_dry_apply,
+    /* .reset             = */ llama_sampler_dry_reset,
+    /* .clone             = */ llama_sampler_dry_clone,
+    /* .free              = */ llama_sampler_dry_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab, int32_t n_ctx_train, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
@@ -2261,6 +3221,10 @@ struct llama_sampler * llama_sampler_init_dry(const struct llama_vocab * vocab,
 
     const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
 
+    if (!dry_enabled) {
+        return llama_sampler_init_empty("?dry");
+    }
+
     if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
         // Process sequence breakers
         for (size_t i = 0; i < num_breakers; ++i) {
@@ -2331,16 +3295,23 @@ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, floa
 
 // logit-bias
 
-struct llama_sampler_logit_bias {
+struct llama_sampler_logit_bias : public llama_sampler_backend {
     const int32_t n_vocab;
 
     const std::vector<llama_logit_bias> logit_bias;
 
     std::vector<llama_logit_bias> to_search;
+
+    struct ggml_tensor * inp_logit_bias;
+    struct ggml_tensor * inp_logit_idxs;
+
+    ggml_context_ptr inp_ctx;
+    ggml_backend_buffer_ptr inp_buf;
 };
 
-static const char * llama_sampler_logit_bias_name(const struct llama_sampler * /*smpl*/) {
-    return "logit-bias";
+static const char * llama_sampler_logit_bias_name(const struct llama_sampler * smpl) {
+    auto * ctx = (llama_sampler_logit_bias *) smpl->ctx;
+    return ctx->get_name();
 }
 
 static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
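The four new members hold the device-side inputs for the backend path: a bias tensor, an index tensor, and the ggml context and buffer that own them. `ggml_context_ptr` and `ggml_backend_buffer_ptr` are the RAII wrappers from ggml's `ggml-cpp.h`, roughly equivalent to the paraphrased sketch below (see the actual header for the exact definitions; this assumes the ggml headers are already included for the underlying types):

```cpp
#include <memory>

// Roughly what ggml-cpp.h provides: unique_ptr aliases whose deleters call
// the matching ggml free functions, so inp_ctx/inp_buf are released when the
// sampler context is deleted.
struct ggml_context_deleter        { void operator()(ggml_context * ctx)           { ggml_free(ctx); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };

typedef std::unique_ptr<ggml_context,        ggml_context_deleter>        ggml_context_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
```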
@@ -2385,25 +3356,123 @@ static void llama_sampler_logit_bias_free(struct llama_sampler * smpl) {
     delete (llama_sampler_logit_bias *) smpl->ctx;
 }
 
+static void llama_sampler_logit_bias_backend_apply(
+        struct llama_sampler * smpl,
+        struct ggml_context * ctx,
+        struct ggml_cgraph * gf,
+        struct llama_sampler_data * data) {
+    GGML_UNUSED(gf);
+    GGML_UNUSED(ctx);
+
+    auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+    if (sctx->logit_bias.empty()) {
+        return;
+    }
+
+    ggml_tensor * cur = ggml_fill(ctx, data->logits, 0.0f);
+
+    cur = ggml_reshape_2d(ctx, cur, 1, ggml_nelements(cur));
+    cur = ggml_set_rows(ctx, cur, sctx->inp_logit_bias, sctx->inp_logit_idxs);
+    cur = ggml_reshape_1d(ctx, cur, ggml_nelements(cur));
+
+    data->logits = ggml_add(ctx, data->logits, cur);
+}
+
+static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * smpl) {
+    auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+    if (sctx->logit_bias.empty()) {
+        return;
+    }
+
+    GGML_ASSERT(sctx->inp_logit_bias != nullptr);
+    GGML_ASSERT(sctx->inp_logit_idxs != nullptr);
+
+    const size_t n = sctx->logit_bias.size();
+
+    std::vector<float> data_logit_bias(n, 0.0f);
+    std::vector<int32_t> data_logit_idxs(n, 0);
+    for (size_t i = 0; i < n; ++i) {
+        const auto & lb = sctx->logit_bias[i];
+        GGML_ASSERT(lb.token >= 0 && lb.token < (int32_t) sctx->n_vocab);
+        data_logit_bias[i] = lb.bias;
+        data_logit_idxs[i] = lb.token;
+    }
+
+    ggml_backend_tensor_set(sctx->inp_logit_bias, data_logit_bias.data(), 0, ggml_nbytes(sctx->inp_logit_bias));
+    ggml_backend_tensor_set(sctx->inp_logit_idxs, data_logit_idxs.data(), 0, ggml_nbytes(sctx->inp_logit_idxs));
+}
+
+static bool llama_sampler_logit_bias_backend_init(
+        struct llama_sampler * smpl,
+        ggml_backend_buffer_type_t buft) {
+    auto * sctx = (llama_sampler_logit_bias *) smpl->ctx;
+
+    sctx->init(true);
+
+    if (sctx->logit_bias.empty()) {
+        return true;
+    }
+
+    ggml_init_params params = {
+        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
+        /*.mem_buffer =*/ nullptr,
+        /*.no_alloc   =*/ true,
+    };
+
+    sctx->inp_ctx.reset(ggml_init(params));
+
+    const size_t n = sctx->logit_bias.size();
+
+    sctx->inp_logit_bias = ggml_new_tensor_2d(sctx->inp_ctx.get(), GGML_TYPE_F32, 1, n);
+    ggml_set_name(sctx->inp_logit_bias, "logit_bias");
+    ggml_set_input(sctx->inp_logit_bias);
+
+    sctx->inp_logit_idxs = ggml_new_tensor_1d(sctx->inp_ctx.get(), GGML_TYPE_I32, n);
+    ggml_set_name(sctx->inp_logit_idxs, "logit_idxs");
+    ggml_set_input(sctx->inp_logit_idxs);
+
+    // Allocate all tensors from our context to the backend
+    sctx->inp_buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(sctx->inp_ctx.get(), buft));
+
+    ggml_backend_buffer_clear(sctx->inp_buf.get(), 0);
+
+    return true;
+}
+
 static struct llama_sampler_i llama_sampler_logit_bias_i = {
-    /* .name   = */ llama_sampler_logit_bias_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_logit_bias_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_logit_bias_clone,
-    /* .free   = */ llama_sampler_logit_bias_free,
+    /* .name              = */ llama_sampler_logit_bias_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_logit_bias_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_logit_bias_clone,
+    /* .free              = */ llama_sampler_logit_bias_free,
+    /* .backend_init      = */ llama_sampler_logit_bias_backend_init,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ llama_sampler_logit_bias_backend_apply,
+    /* .backend_set_input = */ llama_sampler_logit_bias_backend_set_input,
 };
 
 struct llama_sampler * llama_sampler_init_logit_bias(
         int32_t n_vocab,
         int32_t n_logit_bias,
         const llama_logit_bias * logit_bias) {
+    const bool is_empty = n_logit_bias <= 0;
+
+    if (is_empty) {
+        return llama_sampler_init_empty("?logit-bias");
+    }
+
     return llama_sampler_init(
         /* .iface = */ &llama_sampler_logit_bias_i,
         /* .ctx   = */ new llama_sampler_logit_bias {
-            /* .n_vocab    = */ n_vocab,
-            /* .logit_bias = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
-            /* .to_search  = */ {},
+            ("logit-bias"),
+            /* .n_vocab        = */ n_vocab,
+            /* .logit_bias     = */ std::vector<llama_logit_bias>(logit_bias, logit_bias + n_logit_bias),
+            /* .to_search      = */ {},
+            /* .inp_logit_bias = */ nullptr,
+            /* .inp_logit_idxs = */ nullptr,
+            /* .inp_ctx        = */ nullptr,
+            /* .inp_buf        = */ nullptr,
         }
     );
 }
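Because a ggml graph cannot index-assign in place, `backend_apply` builds a zero tensor shaped like the logits, scatters the uploaded bias values into it with `ggml_set_rows` at the uploaded token indices, and adds the result onto the logits; `backend_set_input` refills the two input tensors before the graph runs. A scalar equivalent of that scatter-then-add (illustrative, not from the package):

```cpp
#include <cstdint>
#include <vector>

// Scalar equivalent of the backend_apply graph above: build a zero vector,
// scatter biases into it at the given token indices, then add it to the logits.
static void scatter_add_bias_ref(std::vector<float> & logits,
                                 const std::vector<float> & bias,
                                 const std::vector<int32_t> & idxs) {
    std::vector<float> cur(logits.size(), 0.0f);  // ggml_fill(..., 0.0f)
    for (size_t i = 0; i < idxs.size(); ++i) {
        cur[idxs[i]] = bias[i];                   // ggml_set_rows on 1-wide rows
    }
    for (size_t i = 0; i < logits.size(); ++i) {
        logits[i] += cur[i];                      // ggml_add
    }
}
```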
@@ -2616,12 +3685,16 @@ static void llama_sampler_infill_free(struct llama_sampler * smpl) {
 }
 
 static struct llama_sampler_i llama_sampler_infill_i = {
-    /* .name   = */ llama_sampler_infill_name,
-    /* .accept = */ nullptr,
-    /* .apply  = */ llama_sampler_infill_apply,
-    /* .reset  = */ nullptr,
-    /* .clone  = */ llama_sampler_infill_clone,
-    /* .free   = */ llama_sampler_infill_free,
+    /* .name              = */ llama_sampler_infill_name,
+    /* .accept            = */ nullptr,
+    /* .apply             = */ llama_sampler_infill_apply,
+    /* .reset             = */ nullptr,
+    /* .clone             = */ llama_sampler_infill_clone,
+    /* .free              = */ llama_sampler_infill_free,
+    /* .backend_init      = */ nullptr,
+    /* .backend_accept    = */ nullptr,
+    /* .backend_apply     = */ nullptr,
+    /* .backend_set_input = */ nullptr,
 };
 
 struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab) {
@@ -2653,7 +3726,7 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {
     if (smpl->iface == &llama_sampler_chain_i) {
         const auto * ctx = (const llama_sampler_chain *) smpl->ctx;
         for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) {
-            const uint32_t seed = llama_sampler_get_seed(*it);
+            const uint32_t seed = llama_sampler_get_seed(it->ptr);
             if (seed != LLAMA_DEFAULT_SEED) {
                 return seed;
             }
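With chain entries now stored as a small struct (hence `it->ptr`) rather than a bare pointer, the lookup logic itself is unchanged: the chain is walked back to front and the first sampler reporting a non-default seed wins. Typical usage (API names from llama.h; values illustrative):

```cpp
#include "llama.h"

// The chain reports the seed of the last seeded sampler it contains, so a
// typical chain ending in llama_sampler_init_dist(seed) returns that seed.
static uint32_t chain_seed_example(llama_sampler_chain_params params) {
    llama_sampler * chain = llama_sampler_chain_init(params);
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(1234));

    const uint32_t seed = llama_sampler_get_seed(chain); // 1234

    llama_sampler_free(chain);
    return seed;
}
```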