cui-llama.rn 1.4.4 → 1.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/android/src/main/CMakeLists.txt +2 -2
  2. package/android/src/main/jni.cpp +12 -10
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/chat-template.hpp +529 -529
  12. package/cpp/chat.cpp +959 -265
  13. package/cpp/chat.h +135 -0
  14. package/cpp/common.cpp +2064 -1996
  15. package/cpp/common.h +700 -744
  16. package/cpp/ggml-alloc.c +1039 -1030
  17. package/cpp/ggml-alloc.h +1 -1
  18. package/cpp/ggml-backend-impl.h +255 -255
  19. package/cpp/ggml-backend-reg.cpp +586 -582
  20. package/cpp/ggml-backend.cpp +2004 -2002
  21. package/cpp/ggml-backend.h +354 -354
  22. package/cpp/ggml-common.h +1851 -1851
  23. package/cpp/ggml-cpp.h +39 -39
  24. package/cpp/ggml-cpu-aarch64.cpp +4248 -4247
  25. package/cpp/ggml-cpu-aarch64.h +8 -8
  26. package/cpp/ggml-cpu-impl.h +531 -380
  27. package/cpp/ggml-cpu-quants.c +12527 -11517
  28. package/cpp/ggml-cpu-traits.cpp +36 -36
  29. package/cpp/ggml-cpu-traits.h +38 -38
  30. package/cpp/ggml-cpu.c +15766 -14485
  31. package/cpp/ggml-cpu.cpp +655 -633
  32. package/cpp/ggml-cpu.h +138 -135
  33. package/cpp/ggml-impl.h +567 -567
  34. package/cpp/ggml-metal-impl.h +235 -0
  35. package/cpp/ggml-metal.h +66 -66
  36. package/cpp/ggml-metal.m +5146 -5002
  37. package/cpp/ggml-opt.cpp +854 -854
  38. package/cpp/ggml-opt.h +216 -216
  39. package/cpp/ggml-quants.c +5238 -5238
  40. package/cpp/ggml-threading.h +14 -14
  41. package/cpp/ggml.c +6529 -6524
  42. package/cpp/ggml.h +2198 -2194
  43. package/cpp/gguf.cpp +1329 -1329
  44. package/cpp/gguf.h +202 -202
  45. package/cpp/json-schema-to-grammar.cpp +1024 -1025
  46. package/cpp/json-schema-to-grammar.h +21 -22
  47. package/cpp/json.hpp +24766 -24766
  48. package/cpp/llama-adapter.cpp +347 -347
  49. package/cpp/llama-adapter.h +74 -74
  50. package/cpp/llama-arch.cpp +1513 -1492
  51. package/cpp/llama-arch.h +403 -402
  52. package/cpp/llama-batch.cpp +368 -368
  53. package/cpp/llama-batch.h +88 -88
  54. package/cpp/llama-chat.cpp +588 -587
  55. package/cpp/llama-chat.h +53 -53
  56. package/cpp/llama-context.cpp +1775 -1775
  57. package/cpp/llama-context.h +128 -128
  58. package/cpp/llama-cparams.cpp +1 -1
  59. package/cpp/llama-cparams.h +37 -37
  60. package/cpp/llama-cpp.h +30 -30
  61. package/cpp/llama-grammar.cpp +1219 -1219
  62. package/cpp/llama-grammar.h +173 -164
  63. package/cpp/llama-hparams.cpp +71 -71
  64. package/cpp/llama-hparams.h +139 -139
  65. package/cpp/llama-impl.cpp +167 -167
  66. package/cpp/llama-impl.h +61 -61
  67. package/cpp/llama-kv-cache.cpp +718 -718
  68. package/cpp/llama-kv-cache.h +219 -218
  69. package/cpp/llama-mmap.cpp +600 -590
  70. package/cpp/llama-mmap.h +68 -68
  71. package/cpp/llama-model-loader.cpp +1124 -1124
  72. package/cpp/llama-model-loader.h +167 -167
  73. package/cpp/llama-model.cpp +4087 -4023
  74. package/cpp/llama-model.h +370 -370
  75. package/cpp/llama-sampling.cpp +2558 -2525
  76. package/cpp/llama-sampling.h +32 -32
  77. package/cpp/llama-vocab.cpp +3264 -3252
  78. package/cpp/llama-vocab.h +125 -125
  79. package/cpp/llama.cpp +10284 -10137
  80. package/cpp/llama.h +1354 -1340
  81. package/cpp/log.cpp +393 -423
  82. package/cpp/log.h +132 -132
  83. package/cpp/minja/chat-template.hpp +529 -0
  84. package/cpp/minja/minja.hpp +2915 -0
  85. package/cpp/minja.hpp +2915 -2883
  86. package/cpp/rn-llama.cpp +20 -37
  87. package/cpp/rn-llama.h +12 -2
  88. package/cpp/sampling.cpp +570 -532
  89. package/cpp/sgemm.cpp +2598 -2598
  90. package/cpp/sgemm.h +14 -14
  91. package/cpp/speculative.cpp +278 -277
  92. package/cpp/speculative.h +28 -28
  93. package/package.json +1 -1
  94. package/android/src/main/build-arm64/CMakeCache.txt +0 -429
  95. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCCompiler.cmake +0 -81
  96. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeCXXCompiler.cmake +0 -101
  97. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_C.bin +0 -0
  98. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeDetermineCompilerABI_CXX.bin +0 -0
  99. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CMakeSystem.cmake +0 -15
  100. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.c +0 -904
  101. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdC/CMakeCCompilerId.o +0 -0
  102. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.cpp +0 -919
  103. package/android/src/main/build-arm64/CMakeFiles/3.31.4/CompilerIdCXX/CMakeCXXCompilerId.o +0 -0
  104. package/android/src/main/build-arm64/CMakeFiles/CMakeConfigureLog.yaml +0 -431
  105. package/android/src/main/build-arm64/CMakeFiles/CMakeDirectoryInformation.cmake +0 -16
  106. package/android/src/main/build-arm64/CMakeFiles/Makefile.cmake +0 -165
  107. package/android/src/main/build-arm64/CMakeFiles/Makefile2 +0 -297
  108. package/android/src/main/build-arm64/CMakeFiles/Progress/1 +0 -1
  109. package/android/src/main/build-arm64/CMakeFiles/Progress/2 +0 -1
  110. package/android/src/main/build-arm64/CMakeFiles/Progress/3 +0 -1
  111. package/android/src/main/build-arm64/CMakeFiles/Progress/4 +0 -1
  112. package/android/src/main/build-arm64/CMakeFiles/Progress/5 +0 -1
  113. package/android/src/main/build-arm64/CMakeFiles/Progress/6 +0 -1
  114. package/android/src/main/build-arm64/CMakeFiles/Progress/count.txt +0 -1
  115. package/android/src/main/build-arm64/CMakeFiles/TargetDirectories.txt +0 -8
  116. package/android/src/main/build-arm64/CMakeFiles/cmake.check_cache +0 -1
  117. package/android/src/main/build-arm64/CMakeFiles/progress.marks +0 -1
  118. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o +0 -0
  119. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-alloc.c.o.d +0 -58
  120. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o +0 -0
  121. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend-reg.cpp.o.d +0 -756
  122. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o +0 -0
  123. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-backend.cpp.o.d +0 -709
  124. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o +0 -0
  125. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-aarch64.cpp.o.d +0 -714
  126. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o +0 -0
  127. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-quants.c.o.d +0 -62
  128. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o +0 -0
  129. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu-traits.cpp.o.d +0 -708
  130. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o +0 -0
  131. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.c.o.d +0 -113
  132. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o +0 -0
  133. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-cpu.cpp.o.d +0 -713
  134. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o +0 -0
  135. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-opt.cpp.o.d +0 -763
  136. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o +0 -0
  137. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-quants.c.o.d +0 -61
  138. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o +0 -0
  139. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml-threading.cpp.o.d +0 -707
  140. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o +0 -0
  141. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/ggml.c.o.d +0 -104
  142. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o +0 -0
  143. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/gguf.cpp.o.d +0 -714
  144. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o +0 -0
  145. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/D_/dev/react-native/cui-llama.rn/cpp/log.cpp.o.d +0 -723
  146. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/DependInfo.cmake +0 -62
  147. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/build.make +0 -722
  148. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/cmake_clean.cmake +0 -89
  149. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.make +0 -2
  150. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/compiler_depend.ts +0 -2
  151. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/depend.make +0 -2
  152. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/flags.make +0 -17
  153. package/android/src/main/build-arm64/CMakeFiles/rnllama.dir/progress.make +0 -41
  154. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/DependInfo.cmake +0 -62
  155. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/build.make +0 -722
  156. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/cmake_clean.cmake +0 -89
  157. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.make +0 -2
  158. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/compiler_depend.ts +0 -2
  159. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/depend.make +0 -2
  160. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/flags.make +0 -17
  161. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8.dir/progress.make +0 -41
  162. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/DependInfo.cmake +0 -62
  163. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/build.make +0 -722
  164. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/cmake_clean.cmake +0 -89
  165. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.make +0 -2
  166. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/compiler_depend.ts +0 -2
  167. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/depend.make +0 -2
  168. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/flags.make +0 -17
  169. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2.dir/progress.make +0 -41
  170. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/DependInfo.cmake +0 -62
  171. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/build.make +0 -722
  172. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/cmake_clean.cmake +0 -89
  173. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.make +0 -2
  174. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/compiler_depend.ts +0 -2
  175. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/depend.make +0 -2
  176. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/flags.make +0 -17
  177. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod.dir/progress.make +0 -41
  178. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/DependInfo.cmake +0 -62
  179. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/build.make +0 -722
  180. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/cmake_clean.cmake +0 -89
  181. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.make +0 -2
  182. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/compiler_depend.ts +0 -2
  183. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/depend.make +0 -2
  184. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/flags.make +0 -17
  185. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_dotprod_i8mm.dir/progress.make +0 -41
  186. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/DependInfo.cmake +0 -62
  187. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/build.make +0 -722
  188. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/cmake_clean.cmake +0 -89
  189. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.make +0 -2
  190. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/compiler_depend.ts +0 -2
  191. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/depend.make +0 -2
  192. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/flags.make +0 -17
  193. package/android/src/main/build-arm64/CMakeFiles/rnllama_v8_2_i8mm.dir/progress.make +0 -41
  194. package/android/src/main/build-arm64/Makefile +0 -1862
  195. package/android/src/main/build-arm64/cmake_install.cmake +0 -66
  196. package/cpp/chat.hpp +0 -55
  197. package/cpp/rn-llama.hpp +0 -913
package/cpp/sampling.cpp CHANGED
@@ -1,532 +1,570 @@
-#include "sampling.h"
-
-#include "common.h"
-
-#include <cmath>
-#include <unordered_map>
-
-// the ring buffer works similarly to std::deque, but with a fixed capacity
-// TODO: deduplicate with llama-impl.h
-template<typename T>
-struct ring_buffer {
-    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
-
-    T & front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    const T & front() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[first];
-    }
-
-    T & back() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    const T & back() const {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        return data[pos];
-    }
-
-    void push_back(const T & value) {
-        if (sz == capacity) {
-            // advance the start when buffer is full
-            first = (first + 1) % capacity;
-        } else {
-            sz++;
-        }
-        data[pos] = value;
-        pos = (pos + 1) % capacity;
-    }
-
-    T pop_front() {
-        if (sz == 0) {
-            throw std::runtime_error("ring buffer is empty");
-        }
-        T value = data[first];
-        first = (first + 1) % capacity;
-        sz--;
-        return value;
-    }
-
-    const T & rat(size_t i) const {
-        if (i >= sz) {
-            throw std::runtime_error("ring buffer: index out of bounds");
-        }
-        return data[(first + sz - i - 1) % capacity];
-    }
-
-    std::vector<T> to_vector() const {
-        std::vector<T> result;
-        result.reserve(sz);
-        for (size_t i = 0; i < sz; i++) {
-            result.push_back(data[(first + i) % capacity]);
-        }
-        return result;
-    }
-
-    void clear() {
-        // here only reset the status of the buffer
-        sz = 0;
-        first = 0;
-        pos = 0;
-    }
-
-    bool empty() const {
-        return sz == 0;
-    }
-
-    size_t size() const {
-        return sz;
-    }
-
-    size_t capacity = 0;
-    size_t sz = 0;
-    size_t first = 0;
-    size_t pos = 0;
-    std::vector<T> data;
-};
-
-struct common_sampler {
-    common_params_sampling params;
-
-    struct llama_sampler * grmr;
-    struct llama_sampler * chain;
-
-    ring_buffer<llama_token> prev;
-
-    std::vector<llama_token_data> cur;
-
-    llama_token_data_array cur_p;
-
-    void set_logits(struct llama_context * ctx, int idx) {
-        const auto * logits = llama_get_logits_ith(ctx, idx);
-
-        const llama_model * model = llama_get_model(ctx);
-        const llama_vocab * vocab = llama_model_get_vocab(model);
-
-        const int n_vocab = llama_vocab_n_tokens(vocab);
-
-        cur.resize(n_vocab);
-
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
-        }
-
-        cur_p = { cur.data(), cur.size(), -1, false };
-    }
-};
-
-std::string common_params_sampling::print() const {
-    char result[1024];
-
-    snprintf(result, sizeof(result),
-            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
-            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
-            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
-            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
-            mirostat, mirostat_eta, mirostat_tau);
-
-    return std::string(result);
-}
-
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
-    const llama_vocab * vocab = llama_model_get_vocab(model);
-
-    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
-
-    lparams.no_perf = params.no_perf;
-
-    struct llama_sampler * grmr;
-    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
-#ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
-#else
-        LM_GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
-#endif // LLAMA_USE_LLGUIDANCE
-    } else {
-        std::vector<const char *> trigger_words;
-        trigger_words.reserve(params.grammar_trigger_words.size());
-        for (const auto & str : params.grammar_trigger_words) {
-            trigger_words.push_back(str.word.c_str());
-        }
-
-        grmr = params.grammar_lazy
-            ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root",
-                  trigger_words.data(), trigger_words.size(),
-                  params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size())
-            : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-    }
-
-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr = */ grmr,
-        /* .chain = */ llama_sampler_chain_init(lparams),
-        /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur = */ {},
-        /* .cur_p = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
-
-    if (params.mirostat == 0) {
-        if (params.top_n_sigma >= 0) {
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
-            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
-        } else {
-            for (const auto & cnstr : params.samplers) {
-                switch (cnstr) {
-                    case COMMON_SAMPLER_TYPE_DRY:
-                        {
-                            std::vector<const char *> c_breakers;
-                            c_breakers.reserve(params.dry_sequence_breakers.size());
-                            for (const auto & str : params.dry_sequence_breakers) {
-                                c_breakers.push_back(str.c_str());
-                            }
-
-                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
-                        }
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_K:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TOP_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_MIN_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_XTC:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
-                        break;
-                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
-                        break;
-                    case COMMON_SAMPLER_TYPE_INFILL:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
-                        break;
-                    case COMMON_SAMPLER_TYPE_PENALTIES:
-                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
-                        break;
-                    default:
-                        LM_GGML_ASSERT(false && "unknown sampler type");
-                }
-            }
-        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
-    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
-    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
-    } else {
-        LM_GGML_ASSERT(false && "unknown mirostat version");
-    }
-
-    return result;
-}
-
-void common_sampler_free(struct common_sampler * gsmpl) {
-    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
-        llama_sampler_free(gsmpl->chain);
-
-        delete gsmpl;
-    }
-}
-
-void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
-
-    llama_sampler_accept(gsmpl->chain, token);
-
-    gsmpl->prev.push_back(token);
-}
-
-void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
-}
-
-struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
-    return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev = */ gsmpl->prev,
-        /* .cur = */ gsmpl->cur,
-        /* .cur_p = */ gsmpl->cur_p,
-    };
-}
-
-void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
-    // TODO: measure grammar performance
-
-    if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
-    }
-    if (ctx) {
-        llama_perf_context_print(ctx);
-    }
-}
-
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
-    gsmpl->set_logits(ctx, idx);
-
-    auto & grmr = gsmpl->grmr;
-    auto & chain = gsmpl->chain;
-    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
-
-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
-
-    llama_sampler_apply(chain, &cur_p);
-
-    LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
-
-    const llama_token id = cur_p.data[cur_p.selected].id;
-
-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data single_token_data = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr, &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
-}
-
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
-    LM_GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
-
-    std::vector<llama_token> result;
-    result.reserve(idxs.size());
-
-    size_t i = 0;
-    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
-
-        common_sampler_accept(gsmpl, id, true);
-
-        result.push_back(id);
-
-        if (draft[i] != id) {
-            break;
-        }
-    }
-
-    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
-
-        common_sampler_accept(gsmpl, id, true);
-
-        result.push_back(id);
-    }
-
-    return result;
-}
-
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
-    std::vector<int> idxs(draft.size() + 1);
-    for (size_t i = 0; i < idxs.size(); ++i) {
-        idxs[i] = i;
-    }
-
-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
-}
-
-uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
-    return llama_sampler_get_seed(gsmpl->chain);
-}
-
-// helpers
-
-llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
-    return &gsmpl->cur_p;
-}
-
-llama_token common_sampler_last(const struct common_sampler * gsmpl) {
-    return gsmpl->prev.rat(0);
-}
-
-std::string common_sampler_print(const struct common_sampler * gsmpl) {
-    std::string result = "logits ";
-
-    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
-        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
-    }
-
-    return result;
-}
-
-std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
-    n = std::min(n, (int) gsmpl->prev.size());
-
-    if (n <= 0) {
-        return "";
-    }
-
-    std::string result;
-    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
-
-    for (int i = n - 1; i >= 0; i--) {
-        const llama_token id = gsmpl->prev.rat(i);
-
-        LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
-
-        result += common_token_to_piece(ctx_main, id);
-    }
-
-    return result;
-}
-
-char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
-    switch (cnstr) {
-        case COMMON_SAMPLER_TYPE_DRY: return 'd';
-        case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
-        case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
-        case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
-        case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
-        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
-        case COMMON_SAMPLER_TYPE_XTC: return 'x';
-        case COMMON_SAMPLER_TYPE_INFILL: return 'i';
-        case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
-        default : return '?';
-    }
-}
-
-std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
-    switch (cnstr) {
-        case COMMON_SAMPLER_TYPE_DRY: return "dry";
-        case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
-        case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
-        case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
-        case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
-        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
-        case COMMON_SAMPLER_TYPE_XTC: return "xtc";
-        case COMMON_SAMPLER_TYPE_INFILL: return "infill";
-        case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
-        default : return "";
-    }
-}
-
-std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
-    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
-        { "dry", COMMON_SAMPLER_TYPE_DRY },
-        { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
-        { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
-        { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
-        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { "xtc", COMMON_SAMPLER_TYPE_XTC },
-        { "infill", COMMON_SAMPLER_TYPE_INFILL },
-        { "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
-    };
-
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
-        { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
-        { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
-        { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
-        { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
-        { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
-    };
-
-    std::vector<common_sampler_type> samplers;
-    samplers.reserve(names.size());
-
-    for (const auto & name : names) {
-        auto sampler = sampler_canonical_name_map.find(name);
-        if (sampler != sampler_canonical_name_map.end()) {
-            samplers.push_back(sampler->second);
-        } else {
-            if (allow_alt_names) {
-                sampler = sampler_alt_name_map.find(name);
-                if (sampler != sampler_alt_name_map.end()) {
-                    samplers.push_back(sampler->second);
-                }
-            }
-        }
-    }
-
-    return samplers;
-}
-
-std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, common_sampler_type> sampler_name_map = {
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
-        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
-    };
-
-    std::vector<common_sampler_type> samplers;
-    samplers.reserve(chars.size());
-
-    for (const auto & c : chars) {
-        const auto sampler = sampler_name_map.find(c);
-        if (sampler != sampler_name_map.end()) {
-            samplers.push_back(sampler->second);
-        }
-    }
-
-    return samplers;
-}
+#include "sampling.h"
+
+#include "common.h"
+
+#include <cmath>
+#include <unordered_map>
+#include <algorithm>
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity
+// TODO: deduplicate with llama-impl.h
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[pos];
+    }
+
+    void push_back(const T & value) {
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    const T & rat(size_t i) const {
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const {
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0;
+    size_t sz = 0;
+    size_t first = 0;
+    size_t pos = 0;
+    std::vector<T> data;
+};
+
+struct common_sampler {
+    common_params_sampling params;
+
+    struct llama_sampler * grmr;
+    struct llama_sampler * chain;
+
+    ring_buffer<llama_token> prev;
+
+    std::vector<llama_token_data> cur;
+
+    llama_token_data_array cur_p;
+
+    void set_logits(struct llama_context * ctx, int idx) {
+        const auto * logits = llama_get_logits_ith(ctx, idx);
+
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+
+        cur.resize(n_vocab);
+
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
+        }
+
+        cur_p = { cur.data(), cur.size(), -1, false };
+    }
+};
+
+std::string common_params_sampling::print() const {
+    char result[1024];
+
+    snprintf(result, sizeof(result),
+            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
+            "\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
+            "\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+            penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
+            dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
+            top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
+            mirostat, mirostat_eta, mirostat_tau);
+
+    return std::string(result);
+}
+
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
+
+    lparams.no_perf = params.no_perf;
+
+    struct llama_sampler * grmr;
+    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
+#ifdef LLAMA_USE_LLGUIDANCE
+        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+#else
+        LM_GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
+#endif // LLAMA_USE_LLGUIDANCE
+    } else {
+        std::vector<std::string> patterns_at_start;
+        std::vector<std::string> patterns_anywhere;
+        std::vector<llama_token> trigger_tokens;
+        for (const auto & trigger : params.grammar_triggers) {
+            switch (trigger.type) {
+                case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
+                {
+                    const auto & word = trigger.value;
+                    patterns_anywhere.push_back(regex_escape(word));
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
+                case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START:
+                {
+                    const auto & pattern = trigger.value;
+                    (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern);
+                    break;
+                }
+                case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN:
+                {
+                    const auto token = trigger.token;
+                    trigger_tokens.push_back(token);
+                    break;
+                }
+                default:
+                    LM_GGML_ASSERT(false && "unknown trigger type");
+            }
+        }
+
+        std::vector<std::string> trigger_patterns;
+        if (!patterns_at_start.empty()) {
+            trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*");
+        }
+        if (!patterns_anywhere.empty()) {
+            trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*");
+        }
+
+        std::vector<const char *> trigger_patterns_c;
+        trigger_patterns_c.reserve(trigger_patterns.size());
+        for (const auto & regex : trigger_patterns) {
+            trigger_patterns_c.push_back(regex.c_str());
+        }
+
+        grmr = params.grammar_lazy
+            ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                  trigger_patterns_c.data(), trigger_patterns_c.size(),
+                  trigger_tokens.data(), trigger_tokens.size())
+            : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+    }
+
+    auto * result = new common_sampler {
+        /* .params = */ params,
+        /* .grmr = */ grmr,
+        /* .chain = */ llama_sampler_chain_init(lparams),
+        /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur = */ {},
+        /* .cur_p = */ {},
+    };
+
+    llama_sampler_chain_add(result->chain,
+            llama_sampler_init_logit_bias(
+                llama_vocab_n_tokens(vocab),
+                params.logit_bias.size(),
+                params.logit_bias.data()));
+
+    if (params.mirostat == 0) {
+        if (params.top_n_sigma >= 0) {
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_temp (params.temp));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+        } else {
+            for (const auto & cnstr : params.samplers) {
+                switch (cnstr) {
+                    case COMMON_SAMPLER_TYPE_DRY:
+                        {
+                            std::vector<const char *> c_breakers;
+                            c_breakers.reserve(params.dry_sequence_breakers.size());
+                            for (const auto & str : params.dry_sequence_breakers) {
+                                c_breakers.push_back(str.c_str());
+                            }
+
+                            llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        }
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_K:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TOP_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_MIN_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_XTC:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TYPICAL_P:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
+                        break;
+                    case COMMON_SAMPLER_TYPE_TEMPERATURE:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                        break;
+                    case COMMON_SAMPLER_TYPE_INFILL:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
+                        break;
+                    case COMMON_SAMPLER_TYPE_PENALTIES:
+                        llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                        break;
+                    default:
+                        LM_GGML_ASSERT(false && "unknown sampler type");
+                }
+            }
+        }
+        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+    } else if (params.mirostat == 1) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+    } else if (params.mirostat == 2) {
+        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+    } else {
+        LM_GGML_ASSERT(false && "unknown mirostat version");
+    }
+
+    return result;
+}
+
+void common_sampler_free(struct common_sampler * gsmpl) {
+    if (gsmpl) {
+        llama_sampler_free(gsmpl->grmr);
+
+        llama_sampler_free(gsmpl->chain);
+
+        delete gsmpl;
+    }
+}
+
+void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    if (accept_grammar) {
+        llama_sampler_accept(gsmpl->grmr, token);
+    }
+
+    llama_sampler_accept(gsmpl->chain, token);
+
+    gsmpl->prev.push_back(token);
+}
+
+void common_sampler_reset(struct common_sampler * gsmpl) {
+    llama_sampler_reset(gsmpl->grmr);
+
+    llama_sampler_reset(gsmpl->chain);
+}
+
+struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
+    return new common_sampler {
+        /* .params = */ gsmpl->params,
+        /* .grmr = */ llama_sampler_clone(gsmpl->grmr),
+        /* .chain = */ llama_sampler_clone(gsmpl->chain),
+        /* .prev = */ gsmpl->prev,
+        /* .cur = */ gsmpl->cur,
+        /* .cur_p = */ gsmpl->cur_p,
+    };
+}
+
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
+    // TODO: measure grammar performance
+
+    if (gsmpl) {
+        llama_perf_sampler_print(gsmpl->chain);
+    }
+    if (ctx) {
+        llama_perf_context_print(ctx);
+    }
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    gsmpl->set_logits(ctx, idx);
+
+    auto & grmr = gsmpl->grmr;
+    auto & chain = gsmpl->chain;
+    auto & cur_p = gsmpl->cur_p; // initialized by set_logits
+
+    if (grammar_first) {
+        llama_sampler_apply(grmr, &cur_p);
+    }
+
+    llama_sampler_apply(chain, &cur_p);
+
+    LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
+
+    const llama_token id = cur_p.data[cur_p.selected].id;
+
+    if (grammar_first) {
+        return id;
+    }
+
+    // check if it the sampled token fits the grammar
+    {
+        llama_token_data single_token_data = { id, 1.0f, 0.0f };
+        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
+
+        llama_sampler_apply(grmr, &single_token_data_array);
+
+        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
+        if (is_valid) {
+            return id;
+        }
+    }
+
+    // resampling:
+    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
+    gsmpl->set_logits(ctx, idx);
+
+    llama_sampler_apply(grmr, &cur_p);
+    llama_sampler_apply(chain, &cur_p);
+
+    LM_GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
+
+    return cur_p.data[cur_p.selected].id;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    LM_GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
+// helpers
+
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
+    return &gsmpl->cur_p;
+}
+
+llama_token common_sampler_last(const struct common_sampler * gsmpl) {
+    return gsmpl->prev.rat(0);
+}
+
+std::string common_sampler_print(const struct common_sampler * gsmpl) {
+    std::string result = "logits ";
+
+    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
+        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+    }
+
+    return result;
+}
+
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
+    n = std::min(n, (int) gsmpl->prev.size());
+
+    if (n <= 0) {
+        return "";
+    }
+
+    std::string result;
+    result.reserve(8*n); // 8 is the average length of a token [citation needed], TODO: compute this from the vocab
+
+    for (int i = n - 1; i >= 0; i--) {
+        const llama_token id = gsmpl->prev.rat(i);
+
+        LM_GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
+
+        result += common_token_to_piece(ctx_main, id);
+    }
+
+    return result;
+}
+
+char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY: return 'd';
+        case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
+        case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
+        case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
+        case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
+        case COMMON_SAMPLER_TYPE_XTC: return 'x';
+        case COMMON_SAMPLER_TYPE_INFILL: return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
+        default : return '?';
+    }
+}
+
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
+    switch (cnstr) {
+        case COMMON_SAMPLER_TYPE_DRY: return "dry";
+        case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
+        case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
+        case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
+        case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
+        case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
+        case COMMON_SAMPLER_TYPE_XTC: return "xtc";
+        case COMMON_SAMPLER_TYPE_INFILL: return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
+        default : return "";
+    }
+}
+
+std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+    std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
+        { "dry", COMMON_SAMPLER_TYPE_DRY },
+        { "top_k", COMMON_SAMPLER_TYPE_TOP_K },
+        { "top_p", COMMON_SAMPLER_TYPE_TOP_P },
+        { "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min_p", COMMON_SAMPLER_TYPE_MIN_P },
+        { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { "xtc", COMMON_SAMPLER_TYPE_XTC },
+        { "infill", COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
+    };
+
+    // since samplers names are written multiple ways
+    // make it ready for both system names and input names
+    std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
+        { "top-k", COMMON_SAMPLER_TYPE_TOP_K },
+        { "top-p", COMMON_SAMPLER_TYPE_TOP_P },
+        { "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
+        { "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { "min-p", COMMON_SAMPLER_TYPE_MIN_P },
+        { "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
+    };
+
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(names.size());
+
+    for (const auto & name : names) {
+        auto sampler = sampler_canonical_name_map.find(name);
+        if (sampler != sampler_canonical_name_map.end()) {
+            samplers.push_back(sampler->second);
+        } else {
+            if (allow_alt_names) {
+                sampler = sampler_alt_name_map.find(name);
+                if (sampler != sampler_alt_name_map.end()) {
+                    samplers.push_back(sampler->second);
+                }
+            }
+        }
+    }
+
+    return samplers;
+}
+
+std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
+    std::unordered_map<char, common_sampler_type> sampler_name_map = {
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_DRY), COMMON_SAMPLER_TYPE_DRY },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
+    };
+
+    std::vector<common_sampler_type> samplers;
+    samplers.reserve(chars.size());
+
+    for (const auto & c : chars) {
+        const auto sampler = sampler_name_map.find(c);
+        if (sampler != sampler_name_map.end()) {
+            samplers.push_back(sampler->second);
+        }
+    }
+
+    return samplers;
+}