cui-llama.rn 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -113,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
113
113
  }
114
114
 
115
115
  static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
116
- // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
116
+ // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
117
117
  // if (k >= (int32_t)cur_p->size) {
118
118
  // return;
119
119
  // }
@@ -734,246 +734,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
734
734
  };
735
735
  }
736
736
 
737
- // xtc
738
-
739
- /*
740
- struct llama_sampler_xtc {
741
- const uint32_t seed;
742
- std::mt19937 rng;
743
- const float xtc_p;
744
- const float xtc_t;
745
- const size_t min_keep;
746
- };
747
-
748
- static const char * llama_sampler_xtc_name(const struct llama_sampler * /* smpl /) {
749
- return "xtc";
750
- }
751
-
752
- static void llama_sampler_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
753
- auto * ctx = (llama_sampler_xtc *) smpl->ctx;
754
-
755
- size_t min_keep = ctx -> min_keep;
756
- std::mt19937 rng = ctx -> rng;
757
-
758
- float xtc_threshold = ctx -> xtc_t;
759
- float xtc_probability = ctx -> xtc_p;
760
-
761
-
762
- if(xtc_threshold <= 0.0f || !cur_p-> size) {
763
- return;
764
- }
765
-
766
- bool xtc_applied = false;
767
- const int64_t t_start_sample_us = lm_ggml_time_us();
768
- llama_sampler_softmax_impl(cur_p);
769
-
770
- // unsorted iteration
771
- if (!cur_p->sorted) {
772
- std::vector<llama_token_data> top_tokens, low_tokens;
773
-
774
- // split candidates into two arrays for low and high tokens
775
- for (size_t i = 0; i < cur_p->size; ++i) {
776
- if (cur_p->data[i].logit >= xtc_threshold) {
777
- top_tokens.push_back(cur_p->data[i]);
778
- } else {
779
- low_tokens.push_back(cur_p-> data[i]);
780
- }
781
- }
782
- // if there is only one or no top_tokens, do not truncate
783
-
784
- if (top_tokens.size() <= 1) {
785
- return;
786
- }
787
-
788
- // sort top_tokens
789
- std::sort(top_tokens.begin(), top_tokens.end(), [](const llama_token_data & a, const llama_token_data & b) {
790
- return a.logit < b.logit;
791
- });
792
-
793
- // insert top_tokens with probability. Always insert lowest top_token
794
- low_tokens.push_back(top_tokens[0]);
795
- std::uniform_real_distribution<float> random_float(0.0 , 1.0);
796
- for (size_t i = 1; i < top_tokens.size(); ++i) {
797
- if(random_float(rng) <= xtc_probability) {
798
- low_tokens.push_back(top_tokens[i]);
799
- }
800
- }
801
- if(low_tokens.size() >= min_keep) {
802
- memcpy(cur_p->data, low_tokens.data(), low_tokens.size()*sizeof(llama_token_data));
803
- cur_p->size = low_tokens.size();
804
- xtc_applied = true;
805
- }
806
- }
807
- // sorted iteration
808
-
809
- if (!xtc_applied) {
810
- // Sort the logits in descending order
811
- if (!cur_p->sorted) {
812
- std::sort(cur_p->data, cur_p->data + cur_p->size, [](const llama_token_data & a, const llama_token_data & b) {
813
- return a.logit > b.logit;
814
- });
815
- cur_p->sorted = true;
816
- }
817
-
818
- // find last token over threshold
819
-
820
- size_t last_index = 0;
821
-
822
- for (; last_index < cur_p -> size; ++last_index) {
823
- if(cur_p -> data[last_index].p < xtc_threshold) {
824
- break;
825
- }
826
- }
827
-
828
- // check if only 1 token above threshold
829
- if(last_index <= 1) {
830
- return;
831
- }
832
- last_index--;
833
- // items beyond safe index will be ignored
834
- size_t safe_index = cur_p -> size;
835
-
836
- // remove tokens until last threshold item
837
- std::uniform_real_distribution<float> random_float(0.0 , 1.0);
838
- for (size_t i = 0; i < last_index; i++) {
839
- if(random_float(rng) < xtc_probability) {
840
- std::swap(cur_p-> data[i], cur_p->data[safe_index - 1]);
841
- safe_index--;
842
- if (cur_p-> sorted) {
843
- cur_p -> sorted = false;
844
- }
845
- }
846
- }
847
- cur_p -> size = safe_index;
848
- }
849
- }
850
-
851
- static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
852
- const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
853
- return llama_sampler_init_xtc(ctx->xtc_p, ctx->xtc_t, ctx->min_keep, ctx->seed);
854
- }
855
-
856
- static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
857
- delete (const llama_sampler_xtc *) smpl->ctx;
858
- }
859
-
860
- static struct llama_sampler_i llama_sampler_xtc_i = {
861
- /* .name = / llama_sampler_xtc_name,
862
- /* .accept = / nullptr,
863
- /* .apply = / llama_sampler_xtc_apply,
864
- /* .reset = / nullptr,
865
- /* .clone = / llama_sampler_xtc_clone,
866
- /* .free = / llama_sampler_xtc_free,
867
- };
868
-
869
- struct llama_sampler * llama_sampler_init_xtc(float xtc_p, float xtc_t, size_t min_keep, uint32_t seed) {
870
- return new llama_sampler {
871
- /* .iface = / &llama_sampler_xtc_i,
872
- /* .ctx = / new llama_sampler_xtc {
873
- /* .seed = / seed,
874
- /* .rng = / std::mt19937(seed),
875
- /* .xtc_p = / xtc_p,
876
- /* .xtc_t = / xtc_t,
877
- /* .min_keep = / min_keep
878
- },
879
- };
880
- }
881
- */
882
- // tail-free
883
-
884
- struct llama_sampler_tail_free {
885
- const float z;
886
- const size_t min_keep;
887
- };
888
-
889
- static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
890
- return "tail-free";
891
- }
892
-
893
- static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
894
- const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
895
-
896
- if (ctx->z >= 1.0f || cur_p->size <= 2) {
897
- return;
898
- }
899
-
900
- llama_sampler_softmax_impl(cur_p);
901
-
902
- // Compute the first and second derivatives
903
- std::vector<float> first_derivatives(cur_p->size - 1);
904
- std::vector<float> second_derivatives(cur_p->size - 2);
905
-
906
- for (size_t i = 0; i < first_derivatives.size(); ++i) {
907
- first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
908
- }
909
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
910
- second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
911
- }
912
-
913
- // Calculate absolute value of second derivatives
914
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
915
- second_derivatives[i] = std::abs(second_derivatives[i]);
916
- }
917
-
918
- // Normalize the second derivatives
919
- {
920
- const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
921
-
922
- if (second_derivatives_sum > 1e-6f) {
923
- for (float & value : second_derivatives) {
924
- value /= second_derivatives_sum;
925
- }
926
- } else {
927
- for (float & value : second_derivatives) {
928
- value = 1.0f / second_derivatives.size();
929
- }
930
- }
931
- }
932
-
933
- float cum_sum = 0.0f;
934
- size_t last_idx = cur_p->size;
935
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
936
- cum_sum += second_derivatives[i];
937
-
938
- // Check if the running sum is greater than z or if we have kept at least min_keep tokens
939
- if (cum_sum > ctx->z && i >= ctx->min_keep) {
940
- last_idx = i;
941
- break;
942
- }
943
- }
944
-
945
- // Resize the output vector to keep only the tokens above the tail location
946
- cur_p->size = last_idx;
947
- }
948
-
949
- static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
950
- const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
951
- return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
952
- }
953
-
954
- static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
955
- delete (llama_sampler_tail_free *) smpl->ctx;
956
- }
957
-
958
- static struct llama_sampler_i llama_sampler_tail_free_i = {
959
- /* .name = */ llama_sampler_tail_free_name,
960
- /* .accept = */ nullptr,
961
- /* .apply = */ llama_sampler_tail_free_apply,
962
- /* .reset = */ nullptr,
963
- /* .clone = */ llama_sampler_tail_free_clone,
964
- /* .free = */ llama_sampler_tail_free_free,
965
- };
966
-
967
- struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
968
- return new llama_sampler {
969
- /* .iface = */ &llama_sampler_tail_free_i,
970
- /* .ctx = */ new llama_sampler_tail_free {
971
- /* .z = */ z,
972
- /*. min_keep = */ min_keep,
973
- },
974
- };
975
- }
976
-
977
737
  // typical
978
738
 
979
739
  struct llama_sampler_typical {
@@ -1829,6 +1589,397 @@ struct llama_sampler * llama_sampler_init_penalties(
1829
1589
  };
1830
1590
  }
1831
1591
 
1592
+ // DRY
1593
+
1594
+ struct llama_sampler_dry {
1595
+ int32_t total_context_size;
1596
+
1597
+ const float dry_multiplier;
1598
+ const float dry_base;
1599
+ const int32_t dry_allowed_length;
1600
+ const int32_t dry_penalty_last_n;
1601
+
1602
+ std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
1603
+ std::vector<int> dry_repeat_count;
1604
+ std::unordered_map<llama_token, int> dry_max_token_repeat;
1605
+ ring_buffer<llama_token> last_tokens;
1606
+ };
1607
+
1608
+ // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1609
+ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
1610
+ for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
1611
+ std::string word = llama_detokenize(vocab, {token_id}, true);
1612
+ if (word.find(str) != std::string::npos) {
1613
+ token_sequences.emplace(token_id, std::vector<llama_token>());
1614
+ } else {
1615
+ size_t word_len = word.size(), str_len = str.size();
1616
+ size_t pos = -1;
1617
+ while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
1618
+ bool match = true;
1619
+ size_t i;
1620
+ for (i = 1; i < str_len && i + pos < word_len; ++i) {
1621
+ if (word[pos + i] != str[i]) {
1622
+ match = false;
1623
+ break;
1624
+ }
1625
+ }
1626
+ if (match) {
1627
+ std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
1628
+ if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
1629
+ tokenization.resize(max_tail_len);
1630
+ }
1631
+
1632
+ // Ensure we don't already have a duplicate matching tokenization
1633
+ auto its = token_sequences.equal_range(token_id);
1634
+ bool found = false;
1635
+ for (auto it = its.first; it != its.second; ++it) {
1636
+ if (tokenization == it->second) {
1637
+ found = true;
1638
+ break;
1639
+ }
1640
+ }
1641
+ if (!found) {
1642
+ token_sequences.emplace(token_id, tokenization);
1643
+ }
1644
+ }
1645
+ }
1646
+ }
1647
+ }
1648
+ }
1649
+
1650
+ static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
1651
+ return "dry";
1652
+ }
1653
+
1654
+ static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
1655
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1656
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
1657
+ return;
1658
+ }
1659
+
1660
+ ctx->last_tokens.push_back(token);
1661
+ }
1662
+
1663
+ // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1664
+ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
1665
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1666
+
1667
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
1668
+ return;
1669
+ }
1670
+
1671
+ int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
1672
+ int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
1673
+
1674
+ if (last_n_repeat <= ctx->dry_allowed_length) {
1675
+ return;
1676
+ }
1677
+
1678
+ ctx->dry_repeat_count.assign(last_n_repeat, 0);
1679
+ ctx->dry_max_token_repeat.clear();
1680
+
1681
+ // Step 1: Look for restart sequences to limit the maximum repetition length.
1682
+ // Work backwards through the context looking for any token that begins a restart sequence.
1683
+ //
1684
+ // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
1685
+ // sequences that together comprise a restart sequence. This allows us to quickly check
1686
+ // whether each token is the head of a complete sequence. Most restart sequences are actually
1687
+ // a single token, and for these the "tail" is an empty vector.
1688
+ //
1689
+ // If the token is a "head", test all restart sequences that begin with this token
1690
+ // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
1691
+ // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
1692
+ // longest matching sequence (if any) is used to limit the maximum repetition length.
1693
+ //
1694
+ // Note that in the case of a short sequence contained in a longer one, this might fail to
1695
+ // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
1696
+ // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
1697
+ // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
1698
+ //
1699
+ // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
1700
+ // have already clamped the maximum tail sequence length when generating `restart_sequences`.
1701
+ // With clamping, this scan is O(N) in the context length.
1702
+
1703
+ int rep_limit = last_n_repeat;
1704
+ for (int i = 0; i < last_n_repeat; ++i) {
1705
+ llama_token token = ctx->last_tokens.rat(i);
1706
+ auto its = ctx->dry_processed_breakers.equal_range(token);
1707
+ if (its.first == ctx->dry_processed_breakers.end()) {
1708
+ continue;
1709
+ }
1710
+ int longest_match = -1;
1711
+ for (auto it = its.first; it != its.second; ++it) {
1712
+ // Note that (*it) does not contain the head character, so seq_len will be
1713
+ // the restart sequence length minus 1.
1714
+ // In the common case of a single-token restart sequence, (*it) will be empty
1715
+ // and we will trivially match.
1716
+ int seq_len = (int)it->second.size();
1717
+ if (seq_len > longest_match && seq_len <= (int)i) {
1718
+ bool match = true;
1719
+ for (int offset = 0; offset < seq_len; ++offset) {
1720
+ // The -1 when indexing `last_tokens` is because we already matched the head.
1721
+ if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
1722
+ match = false;
1723
+ break;
1724
+ }
1725
+ }
1726
+ if (match) {
1727
+ longest_match = seq_len;
1728
+ }
1729
+ }
1730
+ }
1731
+ if (longest_match >= 0) {
1732
+ // We found a restart sequence starting `i` tokens from the end and continuing for
1733
+ // `longest_match` tokens.
1734
+ rep_limit = i - longest_match;
1735
+ break;
1736
+ }
1737
+ }
1738
+ if (rep_limit < ctx->dry_allowed_length) {
1739
+ return;
1740
+ }
1741
+
1742
+ // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
1743
+ // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
1744
+ // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
1745
+ //
1746
+ // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
1747
+ // https://ivanyu.me/blog/2014/10/15/z-algorithm/
1748
+ //
1749
+ // The code below is adapted from the public domain implementation by the same author here:
1750
+ // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
1751
+ //
1752
+ // Example:
1753
+ // Last N tokens: a b c c b c y a b c
1754
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
1755
+ // ^
1756
+ // This `3` means that the last three tokens of the context (a b c) also appear here.
1757
+ //
1758
+ // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
1759
+ // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
1760
+ // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
1761
+ // ensure that the inner while loops only examine each token in the context once as the outer
1762
+ // for loop iterates over the context.
1763
+
1764
+ {
1765
+ const int last = last_n_repeat - 1;
1766
+ int rt = 0, lt = 0;
1767
+
1768
+ for (int k = 1; k < last_n_repeat; ++k) {
1769
+ if (k > rt) {
1770
+ // If k is outside the current Z-box, do naive computation.
1771
+ int n = 0;
1772
+ while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
1773
+ ++n;
1774
+ }
1775
+ ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
1776
+ if (n > 0) {
1777
+ lt = k;
1778
+ rt = k+n-1;
1779
+ }
1780
+ } else {
1781
+ // If k is inside the current Z-box, consider two cases.
1782
+
1783
+ int p = k - lt; // Pair index.
1784
+ int right_part_len = rt - k + 1;
1785
+
1786
+ if (ctx->dry_repeat_count[last - p] < right_part_len) {
1787
+ int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
1788
+ ctx->dry_repeat_count[last - k] = n;
1789
+ } else {
1790
+ int i = rt + 1;
1791
+ while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
1792
+ i += 1;
1793
+ }
1794
+
1795
+ int n = std::min(i - k, rep_limit);
1796
+ ctx->dry_repeat_count[last - k] = n;
1797
+ lt = k;
1798
+ rt = i - 1;
1799
+ }
1800
+ }
1801
+ }
1802
+ }
1803
+
1804
+ // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
1805
+ // that would be generated by emitting each new token that would extend a sequence.
1806
+ //
1807
+ // Following the same example as above:
1808
+ // Last N tokens: a b c c b c y a b c
1809
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
1810
+ //
1811
+ // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
1812
+ // c: 3 -> 4 (from `a b c` to `a b c c`)
1813
+ // b: 1 -> 2 (from `c` to `c b`)
1814
+ // y: 2 -> 3 (from `b c` to `b c y`)
1815
+
1816
+ for (int i = 0; i < last_n_repeat - 1; ++i) {
1817
+ int repeat_len = ctx->dry_repeat_count[i];
1818
+ if (repeat_len >= ctx->dry_allowed_length) {
1819
+ // This token ends a repeat, so the next token would continue one.
1820
+ // By convention, the value of `repeat_len` only includes the tokens currently
1821
+ // in the context, not the new token that would be added.
1822
+ llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
1823
+ // Track the maximum sequence ending in this token.
1824
+ const auto& it = ctx->dry_max_token_repeat.find(token);
1825
+ if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
1826
+ ctx->dry_max_token_repeat[token] = repeat_len;
1827
+ }
1828
+ }
1829
+ }
1830
+
1831
+ // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
1832
+
1833
+ // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
1834
+ // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
1835
+ const float FLOAT_MAX_LOG = 88.7228391f;
1836
+ int max_exponent = 0;
1837
+ if (ctx->dry_base > 1.000001f) {
1838
+ max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
1839
+ }
1840
+
1841
+ for (size_t i = 0; i < cur_p->size; ++i) {
1842
+ const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
1843
+ if (af_kvp != ctx->dry_max_token_repeat.end()) {
1844
+ // Check all sequence breakers starting with this token
1845
+ auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
1846
+ bool is_single_token_breaker = false;
1847
+
1848
+ for (auto it = range.first; it != range.second; ++it) {
1849
+ if (it->second.empty()) {
1850
+ is_single_token_breaker = true;
1851
+ break;
1852
+ }
1853
+ }
1854
+
1855
+ // Apply penalty only if it's not a single-token sequence breaker
1856
+ if (!is_single_token_breaker) {
1857
+ int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
1858
+ if (max_exponent > 0 && repeat_exp > max_exponent) {
1859
+ repeat_exp = max_exponent;
1860
+ }
1861
+ float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
1862
+ cur_p->data[i].logit -= penalty;
1863
+ }
1864
+ }
1865
+ }
1866
+
1867
+ cur_p->sorted = false;
1868
+ }
1869
+
1870
+ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
1871
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1872
+ ctx->last_tokens.clear();
1873
+ ctx->dry_repeat_count.clear();
1874
+ ctx->dry_max_token_repeat.clear();
1875
+ }
1876
+
1877
+ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
1878
+ const auto * ctx = (llama_sampler_dry *) smpl->ctx;
1879
+
1880
+ // nullptr is passed as vocab because it is only needed for raw sequence breaker processing, which we have already done and will be copying
1881
+ auto * result = llama_sampler_init_dry(nullptr, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
1882
+ // Copy the state, including the processed breakers
1883
+ {
1884
+ auto * result_ctx = (llama_sampler_dry *) result->ctx;
1885
+ result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
1886
+ result_ctx->dry_repeat_count = ctx->dry_repeat_count;
1887
+ result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
1888
+ result_ctx->last_tokens = ctx->last_tokens;
1889
+ }
1890
+
1891
+ return result;
1892
+ }
1893
+
1894
+ static void llama_sampler_dry_free(struct llama_sampler * smpl) {
1895
+ delete (llama_sampler_dry *) smpl->ctx;
1896
+ }
1897
+
1898
+ static struct llama_sampler_i llama_sampler_dry_i = {
1899
+ /* .name = */ llama_sampler_dry_name,
1900
+ /* .accept = */ llama_sampler_dry_accept,
1901
+ /* .apply = */ llama_sampler_dry_apply,
1902
+ /* .reset = */ llama_sampler_dry_reset,
1903
+ /* .clone = */ llama_sampler_dry_clone,
1904
+ /* .free = */ llama_sampler_dry_free,
1905
+ };
1906
+
1907
+ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
1908
+ int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
1909
+ std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
1910
+ const int MAX_CHAR_LEN = 40;
1911
+ const int MAX_SEQ_LEN = 20;
1912
+
1913
+ const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
1914
+
1915
+ if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
1916
+ // Process sequence breakers
1917
+ for (size_t i = 0; i < num_breakers; ++i) {
1918
+ if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
1919
+ LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
1920
+ continue;
1921
+ }
1922
+
1923
+ std::string sequence_break(seq_breakers[i]);
1924
+ if (sequence_break.empty()) {
1925
+ LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
1926
+ continue;
1927
+ }
1928
+
1929
+ if (sequence_break.size() > MAX_CHAR_LEN) {
1930
+ LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
1931
+ sequence_break.resize(MAX_CHAR_LEN);
1932
+ }
1933
+
1934
+ get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
1935
+ }
1936
+ }
1937
+
1938
+ return new llama_sampler {
1939
+ /* .iface = */ &llama_sampler_dry_i,
1940
+ /* .ctx = */ new llama_sampler_dry {
1941
+ /* .total_context_size = */ context_size,
1942
+ /* .dry_multiplier = */ dry_multiplier,
1943
+ /* .dry_base = */ dry_base,
1944
+ /* .dry_allowed_length = */ dry_allowed_length,
1945
+ /* .dry_penalty_last_n = */ dry_penalty_last_n,
1946
+ /* .dry_processed_breakers = */ std::move(processed_breakers),
1947
+ /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
1948
+ /* .dry_max_token_repeat = */ {},
1949
+ /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
1950
+ },
1951
+ };
1952
+ }
1953
+
1954
+ // wrapper for test-sampling.cpp
1955
+ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
1956
+ llama_vocab dummy_vocab;
1957
+ auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
1958
+ auto * ctx = (llama_sampler_dry *) result->ctx;
1959
+
1960
+ // Process the token-based sequence breakers
1961
+ ctx->dry_processed_breakers.clear();
1962
+ if (seq_breakers.empty()) {
1963
+ LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
1964
+ } else {
1965
+ for (const auto& breaker : seq_breakers) {
1966
+ if (breaker.empty()) {
1967
+ LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
1968
+ continue;
1969
+ }
1970
+ llama_token head_token = breaker[0];
1971
+ std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
1972
+ ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
1973
+ }
1974
+
1975
+ if (ctx->dry_processed_breakers.empty()) {
1976
+ LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
1977
+ }
1978
+ }
1979
+
1980
+ return result;
1981
+ }
1982
+
1832
1983
  // logit-bias
1833
1984
 
1834
1985
  struct llama_sampler_logit_bias {
@@ -28,3 +28,21 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
28
28
 
29
29
  struct llama_sampler * llama_sampler_init_infill_impl(
30
30
  const struct llama_vocab & vocab);
31
+
32
+ struct llama_sampler * llama_sampler_init_dry_impl(
33
+ const struct llama_vocab & vocab,
34
+ int32_t context_size,
35
+ float dry_multiplier,
36
+ float dry_base,
37
+ int32_t dry_allowed_length,
38
+ int32_t dry_penalty_last_n,
39
+ const char ** seq_breakers,
40
+ size_t num_breakers);
41
+
42
+ struct llama_sampler * llama_sampler_init_dry_testing(
43
+ int32_t context_size,
44
+ float dry_multiplier,
45
+ float dry_base,
46
+ int32_t dry_allowed_length,
47
+ int32_t dry_penalty_last_n,
48
+ const std::vector<std::vector<llama_token>>& seq_breakers);
@@ -1966,3 +1966,19 @@ int32_t llama_detokenize_impl(
1966
1966
 
1967
1967
  return total <= text_len_max ? total : -total;
1968
1968
  }
1969
+
1970
+ std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
1971
+ std::string text;
1972
+ text.resize(std::max(text.capacity(), tokens.size()));
1973
+ int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1974
+ if (n_chars < 0) {
1975
+ text.resize(-n_chars);
1976
+ n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
1977
+ LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
1978
+ }
1979
+
1980
+ text.resize(n_chars);
1981
+
1982
+ // NOTE: the original tokenizer decodes bytes after collecting the pieces.
1983
+ return text;
1984
+ }