@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -63,6 +63,30 @@ static void llama_log_softmax(float * array, size_t size) {
63
63
  }
64
64
  */
65
65
 
66
// [review] Shared temperature helper added in this version.
// [review] temp > 0: divides every candidate logit in cur_p by temp.
// [review] temp <= 0: leaves the logit of the first maximum-logit candidate untouched and sets every other logit to -INFINITY.
+ static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
67
+ if (temp <= 0.0f) {
68
+ // find the token with the highest logit and set the rest to -inf
69
+ size_t max_i = 0;
70
+ float max_l = cur_p->data[0].logit; // NOTE(review): reads data[0] with no size check in this function - confirm callers never pass an empty array
71
+
72
+ for (size_t i = 1; i < cur_p->size; ++i) {
73
+ if (cur_p->data[i ].logit > max_l) { // strict '>' means ties keep the earlier candidate
74
+ cur_p->data[max_i].logit = -INFINITY; // the previous best is no longer the maximum, so mask it
75
+ max_i = i;
76
+ max_l = cur_p->data[i].logit;
77
+ } else {
78
+ cur_p->data[i].logit = -INFINITY;
79
+ }
80
+ }
81
+
82
+ return;
83
+ }
84
+
85
+ for (size_t i = 0; i < cur_p->size; ++i) {
86
+ cur_p->data[i].logit /= temp;
87
+ }
88
+ }
89
+
66
90
  static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
67
91
  GGML_ASSERT(cur_p->size > 0);
68
92
 
@@ -89,7 +113,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
89
113
  }
90
114
 
91
115
  static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
92
- // TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
116
+ // TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
93
117
  // if (k >= (int32_t)cur_p->size) {
94
118
  // return;
95
119
  // }
@@ -427,6 +451,9 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
427
451
 
428
452
  // [review] Apply callback of the "dist" sampler. As of this version it first runs
  // [review] llama_sampler_softmax_impl on the candidates (the three added rows below), then
  // [review] stores the result of llama_sample_dist(cur_p, ctx->rng) in cur_p->selected.
  static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
429
453
  auto * ctx = (llama_sampler_dist *) smpl->ctx;
454
+
455
+ llama_sampler_softmax_impl(cur_p);
456
+
430
457
  cur_p->selected = llama_sample_dist(cur_p, ctx->rng);
431
458
  }
432
459
 
@@ -706,101 +733,6 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
706
733
  };
707
734
  }
708
735
 
709
- // tail-free
710
-
711
- struct llama_sampler_tail_free {
712
- const float z;
713
- const size_t min_keep;
714
- };
715
-
716
- static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
717
- return "tail-free";
718
- }
719
-
720
- static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
721
- const auto * ctx = (llama_sampler_tail_free *) smpl->ctx;
722
-
723
- if (ctx->z >= 1.0f || cur_p->size <= 2) {
724
- return;
725
- }
726
-
727
- llama_sampler_softmax_impl(cur_p);
728
-
729
- // Compute the first and second derivatives
730
- std::vector<float> first_derivatives(cur_p->size - 1);
731
- std::vector<float> second_derivatives(cur_p->size - 2);
732
-
733
- for (size_t i = 0; i < first_derivatives.size(); ++i) {
734
- first_derivatives[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
735
- }
736
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
737
- second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
738
- }
739
-
740
- // Calculate absolute value of second derivatives
741
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
742
- second_derivatives[i] = std::abs(second_derivatives[i]);
743
- }
744
-
745
- // Normalize the second derivatives
746
- {
747
- const float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
748
-
749
- if (second_derivatives_sum > 1e-6f) {
750
- for (float & value : second_derivatives) {
751
- value /= second_derivatives_sum;
752
- }
753
- } else {
754
- for (float & value : second_derivatives) {
755
- value = 1.0f / second_derivatives.size();
756
- }
757
- }
758
- }
759
-
760
- float cum_sum = 0.0f;
761
- size_t last_idx = cur_p->size;
762
- for (size_t i = 0; i < second_derivatives.size(); ++i) {
763
- cum_sum += second_derivatives[i];
764
-
765
- // Check if the running sum is greater than z or if we have kept at least min_keep tokens
766
- if (cum_sum > ctx->z && i >= ctx->min_keep) {
767
- last_idx = i;
768
- break;
769
- }
770
- }
771
-
772
- // Resize the output vector to keep only the tokens above the tail location
773
- cur_p->size = last_idx;
774
- }
775
-
776
- static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
777
- const auto * ctx = (const llama_sampler_tail_free *) smpl->ctx;
778
- return llama_sampler_init_tail_free(ctx->z, ctx->min_keep);
779
- }
780
-
781
- static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
782
- delete (llama_sampler_tail_free *) smpl->ctx;
783
- }
784
-
785
- static struct llama_sampler_i llama_sampler_tail_free_i = {
786
- /* .name = */ llama_sampler_tail_free_name,
787
- /* .accept = */ nullptr,
788
- /* .apply = */ llama_sampler_tail_free_apply,
789
- /* .reset = */ nullptr,
790
- /* .clone = */ llama_sampler_tail_free_clone,
791
- /* .free = */ llama_sampler_tail_free_free,
792
- };
793
-
794
- struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
795
- return new llama_sampler {
796
- /* .iface = */ &llama_sampler_tail_free_i,
797
- /* .ctx = */ new llama_sampler_tail_free {
798
- /* .z = */ z,
799
- /*. min_keep = */ min_keep,
800
- },
801
- };
802
- }
803
-
804
736
  // typical
805
737
 
806
738
  struct llama_sampler_typical {
@@ -912,9 +844,8 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*
912
844
 
913
845
  // [review] Apply callback of the "temp" sampler. The removed rows show the former in-place
  // [review] division of every logit by ctx->temp; the added row delegates to
  // [review] llama_sampler_temp_impl(cur_p, ctx->temp) instead.
  static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
914
846
  const auto * ctx = (llama_sampler_temp *) smpl->ctx;
915
- for (size_t i = 0; i < cur_p->size; ++i) {
916
- cur_p->data[i].logit /= ctx->temp;
917
- }
847
+
848
+ llama_sampler_temp_impl(cur_p, ctx->temp);
918
849
  }
919
850
 
920
851
  static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@@ -961,6 +892,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
961
892
  if (ctx->delta > 0) {
962
893
  const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
963
894
  const float max_temp = ctx->temp + ctx->delta;
895
+
964
896
  float exponent_val = ctx->exponent;
965
897
 
966
898
  // no need to do anything if there is only one (or zero) candidates
@@ -998,9 +930,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
998
930
  #endif
999
931
 
1000
932
  // Apply the dynamically calculated temperature scaling
1001
- for (size_t i = 0; i < cur_p->size; ++i) {
1002
- cur_p->data[i].logit /= dyn_temp;
1003
- }
933
+ llama_sampler_temp_impl(cur_p, dyn_temp);
1004
934
 
1005
935
  // Re-compute softmax probabilities after scaling logits with dynamic temperature
1006
936
  const double max_l_double = cur_p->data[0].logit;
@@ -1024,9 +954,7 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
1024
954
  }
1025
955
  #endif
1026
956
  } else {
1027
- for (size_t i = 0; i < cur_p->size; ++i) {
1028
- cur_p->data[i].logit /= ctx->temp;
1029
- }
957
+ llama_sampler_temp_impl(cur_p, ctx->temp);
1030
958
  }
1031
959
  }
1032
960
 
@@ -1059,6 +987,101 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
1059
987
  };
1060
988
  }
1061
989
 
990
+ // xtc
991
+
992
// [review] Context (smpl->ctx) of the "xtc" sampler; filled in by llama_sampler_init_xtc.
+ struct llama_sampler_xtc {
993
+ const float probability; // apply returns early when a uniform draw from [0, 1) exceeds this, or when it is <= 0
994
+ const float threshold; // apply compares each candidate's p against this; values > 0.5 disable the sampler
995
+ const size_t min_keep; // apply only truncates when at least this many candidates would remain
996
+
997
+ const uint32_t seed; // seed as passed by the caller
998
+ uint32_t seed_cur; // get_rng_seed(seed); used to seed rng at init and on reset
999
+
1000
+ std::mt19937 rng; // generator for the per-call draw in apply; copied by clone
1001
+ };
1002
+
1003
// [review] Name callback of the xtc sampler: always returns the literal "xtc"; the sampler argument is unused.
+ static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
1003
+ return "xtc";
1004
+ }
1006
+
1007
// [review] Apply callback of the xtc sampler (wired into llama_sampler_xtc_i as .apply).
// [review] With probability ctx->probability it drops the leading candidates whose p is >= ctx->threshold,
// [review] except the last of them, by advancing cur_p->data and shrinking cur_p->size.
// NOTE(review): the name is llama_sample_xtc_apply while the sibling callbacks use the llama_sampler_xtc_ prefix - confirm whether that is intentional.
+ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
1007
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
1008
+
1009
+ if (ctx->probability <= 0.0f
1010
+ || ctx->threshold > 0.5f
1011
+ || cur_p->size < 2) {
1012
+ return;
1013
+ }
1014
+
1015
+ std::uniform_real_distribution<float> distribution(0.0f, 1.0f);
1016
+ float chance = distribution(ctx->rng); // advances ctx->rng on every call that passes the guard above
1017
+ if (chance > ctx->probability) return;
1018
+
1019
+ // in case it's not sorted/recalculated yet
1020
+ llama_sampler_softmax_impl(cur_p); // presumably leaves candidates ordered by descending p - body not fully visible here, confirm
1021
+
1022
+ int pos_last = 0; // NOTE(review): int receiving a size_t index below (narrowing)
1023
+
1024
+ for (size_t i = 0; i < cur_p->size; ++i) {
1025
+ if (cur_p->data[i].p >= ctx->threshold) {
1026
+ pos_last = i;
1027
+ } else break; // stop at the first candidate below the threshold
1028
+ }
1029
+
1030
+ if (cur_p->size - pos_last >= ctx->min_keep && pos_last > 0) { // pos_last == 0 means at most one leading candidate met the threshold, so nothing is dropped
1031
+ cur_p->data += pos_last; // skip the first pos_last candidates; the one at pos_last is kept
1032
+ cur_p->size -= pos_last;
1033
+ }
1034
+ }
1036
+
1037
// [review] Clone callback: builds a new xtc sampler from the source's probability, threshold, min_keep and seed,
// [review] then overwrites the new sampler's rng with a copy of the source's rng.
// [review] seed_cur is not copied here; the clone keeps the value llama_sampler_init_xtc computed for it.
+ static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
1037
+ const auto * ctx = (const llama_sampler_xtc *) smpl->ctx;
1038
+ auto * result = llama_sampler_init_xtc(ctx->probability, ctx->threshold, ctx->min_keep, ctx->seed);
1039
+
1040
+ // copy the state
1041
+ {
1042
+ auto * result_ctx = (llama_sampler_xtc *) result->ctx;
1043
+
1044
+ result_ctx->rng = ctx->rng;
1045
+ }
1046
+
1047
+ return result;
1048
+ }
1050
+
1051
// [review] Free callback: deletes the llama_sampler_xtc context stored in smpl->ctx (allocated with new in llama_sampler_init_xtc).
// [review] It does not delete smpl itself.
+ static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
1051
+ delete (llama_sampler_xtc *) smpl->ctx;
1052
+ }
1054
+
1055
// [review] Reset callback: recomputes seed_cur from the stored seed via get_rng_seed and reseeds rng with it.
+ static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
1056
+ auto * ctx = (llama_sampler_xtc *) smpl->ctx;
1057
+ ctx->seed_cur = get_rng_seed(ctx->seed); // get_rng_seed is defined outside this view; whether it returns the same value each time is not shown here
1058
+ ctx->rng.seed(ctx->seed_cur);
1059
+ }
1060
+
1061
// [review] Callback table for the xtc sampler; referenced as .iface by llama_sampler_init_xtc.
// [review] No accept callback is provided (nullptr).
+ static struct llama_sampler_i llama_sampler_xtc_i = {
1061
+ /* .name = */ llama_sampler_xtc_name,
1062
+ /* .accept = */ nullptr,
1063
+ /* .apply = */ llama_sample_xtc_apply,
1064
+ /* .reset = */ llama_sampler_xtc_reset,
1065
+ /* .clone = */ llama_sampler_xtc_clone,
1066
+ /* .free = */ llama_sampler_xtc_free,
1067
+ };
1069
+
1070
// [review] Creates an xtc sampler on the heap.
// [review]   p        - stored as probability
// [review]   t        - stored as threshold
// [review]   min_keep - stored as min_keep
// [review]   seed     - stored as seed; get_rng_seed(seed) becomes seed_cur and seeds the mt19937 generator
// [review] Returns a new llama_sampler whose ctx is a new llama_sampler_xtc; the ctx is released by llama_sampler_xtc_free.
+ struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
1070
+ auto seed_cur = get_rng_seed(seed);
1071
+ return new llama_sampler {
1072
+ /* .iface = */ &llama_sampler_xtc_i,
1073
+ /* .ctx = */ new llama_sampler_xtc {
1074
+ /* .probability = */ p,
1075
+ /* .threshold = */ t,
1076
+ /* .min_keep = */ min_keep,
1077
+ /* .seed = */ seed,
1078
+ /* .seed_cur = */ seed_cur,
1079
+ /* .rng = */ std::mt19937(seed_cur),
1080
+ },
1081
+ };
1082
+ }
1084
+
1062
1085
  // mirostat
1063
1086
 
1064
1087
  struct llama_sampler_mirostat {
@@ -1565,6 +1588,400 @@ struct llama_sampler * llama_sampler_init_penalties(
1565
1588
  };
1566
1589
  }
1567
1590
 
1591
+ // DRY
1592
+
1593
// [review] Context (smpl->ctx) of the "dry" sampler.
// [review] accept and apply both return immediately when dry_multiplier == 0, dry_base < 1 or dry_penalty_last_n == 0.
+ struct llama_sampler_dry {
1593
+ int32_t total_context_size; // upper bound on the scanned window; also the window size when dry_penalty_last_n == -1
1594
+
1595
+ const float dry_multiplier;
1596
+ const float dry_base; // apply derives a maximum exponent from log(dry_base) when it is > 1.000001
1597
+ const int32_t dry_allowed_length; // apply returns early when the scanned window is not longer than this; repeat lengths below it are ignored
1598
+ const int32_t dry_penalty_last_n; // -1 selects total_context_size; other values are clamped to >= 0 in apply
1599
+
1600
+ std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers; // head token -> tail token sequences of the restart sequences
1601
+ std::vector<int> dry_repeat_count; // scratch: re-assigned (zero-filled) on each apply that gets past the early returns
1602
+ std::unordered_map<llama_token, int> dry_max_token_repeat; // scratch: cleared on each apply that gets past the early returns
1603
+ ring_buffer<llama_token> last_tokens; // tokens pushed by the accept callback
1604
+ };
1606
+
1607
// [review] For every token id in [0, vocab.n_vocab), records in token_sequences how that token can begin the string str:
// [review]   - if the token's detokenized text contains str, the token is stored with an empty tail;
// [review]   - otherwise, for each position where the text's remaining characters match a prefix of str, the rest of str
// [review]     is tokenized and stored as the tail (truncated to max_tail_len tokens when max_tail_len >= 0).
// [review] Identical (token, tail) pairs are not inserted twice. Entries already in token_sequences are kept.
+ // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1607
+ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
1608
+ for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
1609
+ std::string word = llama_detokenize(vocab, {token_id}, true);
1610
+ if (word.find(str) != std::string::npos) {
1611
+ token_sequences.emplace(token_id, std::vector<llama_token>()); // NOTE(review): unlike the branch below, no duplicate check before this emplace
1612
+ } else {
1613
+ size_t word_len = word.size(), str_len = str.size();
1614
+ size_t pos = -1; // wraps to the maximum size_t so that pos + 1 is 0 on the first find
1615
+ while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
1616
+ bool match = true;
1617
+ size_t i;
1618
+ for (i = 1; i < str_len && i + pos < word_len; ++i) {
1619
+ if (word[pos + i] != str[i]) {
1620
+ match = false;
1621
+ break;
1622
+ }
1623
+ }
1624
+ if (match) { // here i is the number of leading characters of str covered by this token
1625
+ std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
1626
+ if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
1627
+ tokenization.resize(max_tail_len);
1628
+ }
1629
+
1630
+ // Ensure we don't already have a duplicate matching tokenization
1631
+ auto its = token_sequences.equal_range(token_id);
1632
+ bool found = false;
1633
+ for (auto it = its.first; it != its.second; ++it) {
1634
+ if (tokenization == it->second) {
1635
+ found = true;
1636
+ break;
1637
+ }
1638
+ }
1639
+ if (!found) {
1640
+ token_sequences.emplace(token_id, tokenization);
1641
+ }
1642
+ }
1643
+ }
1644
+ }
1645
+ }
1646
+ }
1648
+
1649
// [review] Name callback of the dry sampler: always returns the literal "dry"; the sampler argument is unused.
+ static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
1649
+ return "dry";
1650
+ }
1652
+
1653
// [review] Accept callback of the dry sampler: appends the accepted token to ctx->last_tokens.
// [review] Nothing is recorded when dry_multiplier == 0, dry_base < 1 or dry_penalty_last_n == 0 (the same guard as in apply).
+ static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
1653
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1654
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
1655
+ return;
1656
+ }
1657
+
1658
+ ctx->last_tokens.push_back(token);
1659
+ }
1661
+
1662
+ // Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
1663
+ static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
1664
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1665
+
1666
+ if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
1667
+ return;
1668
+ }
1669
+
1670
+ int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
1671
+ int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
1672
+
1673
+ if (last_n_repeat <= ctx->dry_allowed_length) {
1674
+ return;
1675
+ }
1676
+
1677
+ ctx->dry_repeat_count.assign(last_n_repeat, 0);
1678
+ ctx->dry_max_token_repeat.clear();
1679
+
1680
+ // Step 1: Look for restart sequences to limit the maximum repetition length.
1681
+ // Work backwards through the context looking for any token that begins a restart sequence.
1682
+ //
1683
+ // The collection `restart_sequences` is a mapping from a "head" token to all "tail"
1684
+ // sequences that together comprise a restart sequence. This allows us to quickly check
1685
+ // whether each token is the head of a complete sequence. Most restart sequences are actually
1686
+ // a single token, and for these the "tail" is an empty vector.
1687
+ //
1688
+ // If the token is a "head", test all restart sequences that begin with this token
1689
+ // (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
1690
+ // 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
1691
+ // longest matching sequence (if any) is used to limit the maximum repetition length.
1692
+ //
1693
+ // Note that in the case of a short sequence contained in a longer one, this might fail to
1694
+ // find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
1695
+ // restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
1696
+ // 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
1697
+ //
1698
+ // This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
1699
+ // have already clamped the maximum tail sequence length when generating `restart_sequences`.
1700
+ // With clamping, this scan is O(N) in the context length.
1701
+
1702
+ int rep_limit = last_n_repeat;
1703
+ for (int i = 0; i < last_n_repeat; ++i) {
1704
+ llama_token token = ctx->last_tokens.rat(i);
1705
+ auto its = ctx->dry_processed_breakers.equal_range(token);
1706
+ if (its.first == ctx->dry_processed_breakers.end()) {
1707
+ continue;
1708
+ }
1709
+ int longest_match = -1;
1710
+ for (auto it = its.first; it != its.second; ++it) {
1711
+ // Note that (*it) does not contain the head character, so seq_len will be
1712
+ // the restart sequence length minus 1.
1713
+ // In the common case of a single-token restart sequence, (*it) will be empty
1714
+ // and we will trivially match.
1715
+ int seq_len = (int)it->second.size();
1716
+ if (seq_len > longest_match && seq_len <= (int)i) {
1717
+ bool match = true;
1718
+ for (int offset = 0; offset < seq_len; ++offset) {
1719
+ // The -1 when indexing `last_tokens` is because we already matched the head.
1720
+ if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
1721
+ match = false;
1722
+ break;
1723
+ }
1724
+ }
1725
+ if (match) {
1726
+ longest_match = seq_len;
1727
+ }
1728
+ }
1729
+ }
1730
+ if (longest_match >= 0) {
1731
+ // We found a restart sequence starting `i` tokens from the end and continuing for
1732
+ // `longest_match` tokens.
1733
+ rep_limit = i - longest_match;
1734
+ break;
1735
+ }
1736
+ }
1737
+ if (rep_limit < ctx->dry_allowed_length) {
1738
+ return;
1739
+ }
1740
+
1741
+ // Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
1742
+ // the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
1743
+ // elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
1744
+ //
1745
+ // This algorithm is not currently documented on Wikipedia, but there is a clear description here:
1746
+ // https://ivanyu.me/blog/2014/10/15/z-algorithm/
1747
+ //
1748
+ // The code below is adapted from the public domain implementation by the same author here:
1749
+ // https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
1750
+ //
1751
+ // Example:
1752
+ // Last N tokens: a b c c b c y a b c
1753
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
1754
+ // ^
1755
+ // This `3` means that the last three tokens of the context (a b c) also appear here.
1756
+ //
1757
+ // This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
1758
+ // for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
1759
+ // repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
1760
+ // ensure that the inner while loops only examine each token in the context once as the outer
1761
+ // for loop iterates over the context.
1762
+
1763
+ {
1764
+ const int last = last_n_repeat - 1;
1765
+ int rt = 0, lt = 0;
1766
+
1767
+ for (int k = 1; k < last_n_repeat; ++k) {
1768
+ if (k > rt) {
1769
+ // If k is outside the current Z-box, do naive computation.
1770
+ int n = 0;
1771
+ while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
1772
+ ++n;
1773
+ }
1774
+ ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
1775
+ if (n > 0) {
1776
+ lt = k;
1777
+ rt = k+n-1;
1778
+ }
1779
+ } else {
1780
+ // If k is inside the current Z-box, consider two cases.
1781
+
1782
+ int p = k - lt; // Pair index.
1783
+ int right_part_len = rt - k + 1;
1784
+
1785
+ if (ctx->dry_repeat_count[last - p] < right_part_len) {
1786
+ int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
1787
+ ctx->dry_repeat_count[last - k] = n;
1788
+ } else {
1789
+ int i = rt + 1;
1790
+ while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
1791
+ i += 1;
1792
+ }
1793
+
1794
+ int n = std::min(i - k, rep_limit);
1795
+ ctx->dry_repeat_count[last - k] = n;
1796
+ lt = k;
1797
+ rt = i - 1;
1798
+ }
1799
+ }
1800
+ }
1801
+ }
1802
+
1803
+ // Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
1804
+ // that would be generated by emitting each new token that would extend a sequence.
1805
+ //
1806
+ // Following the same example as above:
1807
+ // Last N tokens: a b c c b c y a b c
1808
+ // Repeat counts: 0 0 3 1 0 2 0 0 0 0
1809
+ //
1810
+ // For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
1811
+ // c: 3 -> 4 (from `a b c` to `a b c c`)
1812
+ // b: 1 -> 2 (from `c` to `c b`)
1813
+ // y: 2 -> 3 (from `b c` to `b c y`)
1814
+
1815
+ for (int i = 0; i < last_n_repeat - 1; ++i) {
1816
+ int repeat_len = ctx->dry_repeat_count[i];
1817
+ if (repeat_len >= ctx->dry_allowed_length) {
1818
+ // This token ends a repeat, so the next token would continue one.
1819
+ // By convention, the value of `repeat_len` only includes the tokens currently
1820
+ // in the context, not the new token that would be added.
1821
+ llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
1822
+ // Track the maximum sequence ending in this token.
1823
+ const auto& it = ctx->dry_max_token_repeat.find(token);
1824
+ if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
1825
+ ctx->dry_max_token_repeat[token] = repeat_len;
1826
+ }
1827
+ }
1828
+ }
1829
+
1830
+ // Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
1831
+
1832
+ // Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
1833
+ // Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
1834
+ const float FLOAT_MAX_LOG = 88.7228391f;
1835
+ int max_exponent = 0;
1836
+ if (ctx->dry_base > 1.000001f) {
1837
+ max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
1838
+ }
1839
+
1840
+ for (size_t i = 0; i < cur_p->size; ++i) {
1841
+ const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
1842
+ if (af_kvp != ctx->dry_max_token_repeat.end()) {
1843
+ // Check all sequence breakers starting with this token
1844
+ auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
1845
+ bool is_single_token_breaker = false;
1846
+
1847
+ for (auto it = range.first; it != range.second; ++it) {
1848
+ if (it->second.empty()) {
1849
+ is_single_token_breaker = true;
1850
+ break;
1851
+ }
1852
+ }
1853
+
1854
+ // Apply penalty only if it's not a single-token sequence breaker
1855
+ if (!is_single_token_breaker) {
1856
+ int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
1857
+ if (max_exponent > 0 && repeat_exp > max_exponent) {
1858
+ repeat_exp = max_exponent;
1859
+ }
1860
+ float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
1861
+ cur_p->data[i].logit -= penalty;
1862
+ }
1863
+ }
1864
+ }
1865
+
1866
+ cur_p->sorted = false;
1867
+ }
1868
+
1869
+ static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
1870
+ auto * ctx = (llama_sampler_dry *) smpl->ctx;
1871
+ ctx->last_tokens.clear();
1872
+ ctx->dry_repeat_count.clear();
1873
+ ctx->dry_max_token_repeat.clear();
1874
+ }
1875
+
1876
+ static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
1877
+ const auto * ctx = (llama_sampler_dry *) smpl->ctx;
1878
+
1879
+ llama_vocab dummy_vocab;
1880
+
1881
+ // dummy vocab is passed because it is only needed for raw sequence breaker processing, which we have already done and will simply be copying
1882
+ auto * result = llama_sampler_init_dry_impl(dummy_vocab, ctx->total_context_size, ctx->dry_multiplier, ctx->dry_base, ctx->dry_allowed_length, ctx->dry_penalty_last_n, NULL, 0);
1883
+
1884
+ // Copy the state, including the processed breakers
1885
+ {
1886
+ auto * result_ctx = (llama_sampler_dry *) result->ctx;
1887
+ result_ctx->dry_processed_breakers = ctx->dry_processed_breakers;
1888
+ result_ctx->dry_repeat_count = ctx->dry_repeat_count;
1889
+ result_ctx->dry_max_token_repeat = ctx->dry_max_token_repeat;
1890
+ result_ctx->last_tokens = ctx->last_tokens;
1891
+ }
1892
+
1893
+ return result;
1894
+ }
1895
+
1896
+ static void llama_sampler_dry_free(struct llama_sampler * smpl) {
1897
+ delete (llama_sampler_dry *) smpl->ctx;
1898
+ }
1899
+
1900
+ static struct llama_sampler_i llama_sampler_dry_i = {
1901
+ /* .name = */ llama_sampler_dry_name,
1902
+ /* .accept = */ llama_sampler_dry_accept,
1903
+ /* .apply = */ llama_sampler_dry_apply,
1904
+ /* .reset = */ llama_sampler_dry_reset,
1905
+ /* .clone = */ llama_sampler_dry_clone,
1906
+ /* .free = */ llama_sampler_dry_free,
1907
+ };
1908
+
1909
+ struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
1910
+ int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
1911
+ std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;
1912
+ const int MAX_CHAR_LEN = 40;
1913
+ const int MAX_SEQ_LEN = 20;
1914
+
1915
+ const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);
1916
+
1917
+ if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
1918
+ // Process sequence breakers
1919
+ for (size_t i = 0; i < num_breakers; ++i) {
1920
+ if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
1921
+ LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
1922
+ continue;
1923
+ }
1924
+
1925
+ std::string sequence_break(seq_breakers[i]);
1926
+ if (sequence_break.empty()) {
1927
+ LLAMA_LOG_WARN("skipping empty DRY sequence breaker\n");
1928
+ continue;
1929
+ }
1930
+
1931
+ if (sequence_break.size() > MAX_CHAR_LEN) {
1932
+ LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
1933
+ sequence_break.resize(MAX_CHAR_LEN);
1934
+ }
1935
+
1936
+ get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
1937
+ }
1938
+ }
1939
+
1940
+ return new llama_sampler {
1941
+ /* .iface = */ &llama_sampler_dry_i,
1942
+ /* .ctx = */ new llama_sampler_dry {
1943
+ /* .total_context_size = */ context_size,
1944
+ /* .dry_multiplier = */ dry_multiplier,
1945
+ /* .dry_base = */ dry_base,
1946
+ /* .dry_allowed_length = */ dry_allowed_length,
1947
+ /* .dry_penalty_last_n = */ dry_penalty_last_n,
1948
+ /* .dry_processed_breakers = */ std::move(processed_breakers),
1949
+ /* .dry_repeat_count = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
1950
+ /* .dry_max_token_repeat = */ {},
1951
+ /* .last_tokens = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
1952
+ },
1953
+ };
1954
+ }
1955
+
1956
+ // wrapper for test-sampling.cpp
1957
+ struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
1958
+ llama_vocab dummy_vocab;
1959
+ auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
1960
+ auto * ctx = (llama_sampler_dry *) result->ctx;
1961
+
1962
+ // Process the token-based sequence breakers
1963
+ ctx->dry_processed_breakers.clear();
1964
+ if (seq_breakers.empty()) {
1965
+ LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
1966
+ } else {
1967
+ for (const auto& breaker : seq_breakers) {
1968
+ if (breaker.empty()) {
1969
+ LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
1970
+ continue;
1971
+ }
1972
+ llama_token head_token = breaker[0];
1973
+ std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
1974
+ ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
1975
+ }
1976
+
1977
+ if (ctx->dry_processed_breakers.empty()) {
1978
+ LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
1979
+ }
1980
+ }
1981
+
1982
+ return result;
1983
+ }
1984
+
1568
1985
  // logit-bias
1569
1986
 
1570
1987
  struct llama_sampler_logit_bias {
@@ -1644,6 +2061,229 @@ struct llama_sampler * llama_sampler_init_logit_bias(
1644
2061
  };
1645
2062
  }
1646
2063
 
2064
+ // infill
2065
+
2066
+ //#define GGML_DEBUG_SAMPLER_INFILL
2067
+
2068
+ struct llama_sampler_infill {
2069
+ const struct llama_vocab * vocab;
2070
+
2071
+ std::vector<char> buf0;
2072
+ std::vector<char> buf1;
2073
+ };
2074
+
2075
// Name callback of the infill sampler; the sampler argument is ignored.
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
    const char * const sampler_name = "infill";
    return sampler_name;
}
2078
+
2079
+ static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
2080
+ auto * ctx = (llama_sampler_infill *) smpl->ctx;
2081
+
2082
+ llama_sampler_softmax_impl(cur_p);
2083
+
2084
+ #if defined(GGML_DEBUG_SAMPLER_INFILL)
2085
+ #define LOG_DBG_CUR LLAMA_LOG_DEBUG
2086
+ #else
2087
+ #define LOG_DBG_CUR(...)
2088
+ #endif
2089
+
2090
+ for (size_t i = 0; i < cur_p->size; ++i) {
2091
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
2092
+ }
2093
+
2094
+ float p_txt_sum = 0.0f;
2095
+ float p_eog_sum = 0.0f;
2096
+
2097
+ for (size_t i = 0; i < cur_p->size; ++i) {
2098
+ if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
2099
+ p_eog_sum += cur_p->data[i].p;
2100
+ } else {
2101
+ p_txt_sum += cur_p->data[i].p;
2102
+ }
2103
+ }
2104
+
2105
+ const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
2106
+
2107
+ LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
2108
+
2109
+ if (3*p_eog_sum*cur_p->size > p_txt_sum) {
2110
+ LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
2111
+
2112
+ // keep just the EOG tokens
2113
+ const auto size_org = cur_p->size;
2114
+
2115
+ cur_p->size = 0;
2116
+
2117
+ float p_sum = 0.0f;
2118
+
2119
+ for (size_t i = 0; i < size_org; ++i) {
2120
+ if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
2121
+ p_sum += cur_p->data[i].p;
2122
+
2123
+ cur_p->data[cur_p->size++] = cur_p->data[i];
2124
+ }
2125
+ }
2126
+
2127
+ // normalize probs
2128
+ for (size_t i = 0; i < cur_p->size; ++i) {
2129
+ cur_p->data[i].p /= p_sum;
2130
+ }
2131
+
2132
+ return;
2133
+ }
2134
+
2135
+ size_t n_combined = 0; GGML_UNUSED(n_combined);
2136
+
2137
+ // combine tokens with common prefix
2138
+ for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
2139
+ for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
2140
+ if (cur_p->data[i0].logit == -INFINITY) {
2141
+ break;
2142
+ }
2143
+
2144
+ if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
2145
+ continue;
2146
+ }
2147
+
2148
+ int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2149
+ if (len0 < 0) {
2150
+ ctx->buf0.resize(len0);
2151
+ len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
2152
+ assert(len0 > 0);
2153
+ }
2154
+
2155
+ int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2156
+ if (len1 < 0) {
2157
+ ctx->buf1.resize(len1);
2158
+ len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
2159
+ assert(len1 > 0);
2160
+ }
2161
+
2162
+ // token i0 is a prefix of token i1
2163
+ if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
2164
+ int dst = i0;
2165
+ int src = i1;
2166
+
2167
+ // merge into the token with higher probability
2168
+ if (cur_p->data[i1].p > cur_p->data[i0].p) {
2169
+ std::swap(dst, src);
2170
+ }
2171
+
2172
+ cur_p->data[dst].p += cur_p->data[src].p;
2173
+ cur_p->data[src].logit = -INFINITY;
2174
+ cur_p->data[src].p = 0.0f;
2175
+
2176
+ n_combined++;
2177
+ }
2178
+ }
2179
+ }
2180
+
2181
+ size_t n_non_eog = 0;
2182
+
2183
+ size_t size_org = cur_p->size;
2184
+
2185
+ float p_sum = 0.0f;
2186
+ float thold = 0.2f;
2187
+
2188
+ cur_p->size = 0;
2189
+
2190
+ LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
2191
+
2192
+ for (size_t i = 0; i < size_org; ++i) {
2193
+ const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
2194
+
2195
+ if (cur_p->data[i].p < thold && !is_eog) {
2196
+ continue;
2197
+ }
2198
+
2199
+ if (!is_eog) {
2200
+ ++n_non_eog;
2201
+ }
2202
+
2203
+ p_sum += cur_p->data[i].p;
2204
+
2205
+ // keep this token
2206
+ cur_p->data[cur_p->size++] = cur_p->data[i];
2207
+ }
2208
+
2209
+ LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
2210
+
2211
+ // if no non-EOG tokens are left -> reduce cur_p to single EOT token
2212
+ if (n_non_eog == 0) {
2213
+ cur_p->size = 1;
2214
+ cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
2215
+ cur_p->data[0].logit = 1.0f;
2216
+
2217
+ return;
2218
+ }
2219
+
2220
+ // normalize probs
2221
+ for (size_t i = 0; i < cur_p->size; ++i) {
2222
+ cur_p->data[i].p /= p_sum;
2223
+
2224
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
2225
+ }
2226
+
2227
+ size_org = cur_p->size;
2228
+ p_sum = 0.0f;
2229
+ thold = 1.0/(n_non_eog + 1);
2230
+
2231
+ cur_p->size = 0;
2232
+
2233
+ LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
2234
+
2235
+ for (size_t i = 0; i < size_org; ++i) {
2236
+ const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
2237
+
2238
+ if (cur_p->data[i].p < thold && !is_eog) {
2239
+ continue;
2240
+ }
2241
+
2242
+ p_sum += cur_p->data[i].p;
2243
+
2244
+ cur_p->data[cur_p->size++] = cur_p->data[i];
2245
+ }
2246
+
2247
+ // normalize probs
2248
+ for (size_t i = 0; i < cur_p->size; ++i) {
2249
+ cur_p->data[i].p /= p_sum;
2250
+
2251
+ LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
2252
+ }
2253
+
2254
+ #undef LOG_DBG_CUR
2255
+ }
2256
+
2257
+ static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
2258
+ const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
2259
+ return llama_sampler_init_infill_impl(*ctx->vocab);
2260
+ }
2261
+
2262
+ static void llama_sampler_infill_free(struct llama_sampler * smpl) {
2263
+ delete (llama_sampler_infill *) smpl->ctx;
2264
+ }
2265
+
2266
+ static struct llama_sampler_i llama_sampler_infill_i = {
2267
+ /* .name = */ llama_sampler_infill_name,
2268
+ /* .accept = */ nullptr,
2269
+ /* .apply = */ llama_sampler_infill_apply,
2270
+ /* .reset = */ nullptr,
2271
+ /* .clone = */ llama_sampler_infill_clone,
2272
+ /* .free = */ llama_sampler_infill_free,
2273
+ };
2274
+
2275
+ struct llama_sampler * llama_sampler_init_infill_impl(
2276
+ const struct llama_vocab & vocab) {
2277
+ return new llama_sampler {
2278
+ /* .iface = */ &llama_sampler_infill_i,
2279
+ /* .ctx = */ new llama_sampler_infill {
2280
+ /* .vocab = */ &vocab,
2281
+ /* .buf0 = */ std::vector<char>(512),
2282
+ /* .buf1 = */ std::vector<char>(512),
2283
+ },
2284
+ };
2285
+ }
2286
+
1647
2287
  // utils
1648
2288
 
1649
2289
  uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {