@fugood/llama.node 0.0.1-alpha.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/common/ngram-cache.cpp
@@ -0,0 +1,282 @@
+ #include "ngram-cache.h"
+ #include "common.h"
+ #include "log.h"
+
+ #include <cstdint>
+ #include <fstream>
+
+ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
+     const int64_t t_start_ms = ggml_time_ms();
+     const int64_t inp_size = inp.size();
+
+     const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
+     int64_t n_done = 0;
+
+     for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
+         const int64_t i_start = std::max(inp_size - nnew, ngram_size);
+         for (int64_t i = i_start; i < inp_size; ++i) {
+             const int64_t ngram_start = i - ngram_size;
+             llama_ngram ngram(&inp[ngram_start], ngram_size);
+             const llama_token token = inp[i];
+
+             llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+             if (part_it == ngram_cache.end()) {
+                 llama_ngram_cache_part part;
+                 part.emplace(token, 1);
+                 ngram_cache.emplace(ngram, part);
+             } else {
+                 llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+                 if (token_count_it == part_it->second.end()) {
+                     part_it->second.emplace(token, 1);
+                 } else {
+                     token_count_it->second++;
+                 }
+             }
+             ++n_done;
+
+             if (print_progress && n_done % 10000000 == 0) {
+                 const int64_t t_now_ms = ggml_time_ms();
+                 const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done;
+                 const int64_t eta_min = eta_ms / (60*1000);
+                 const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000;
+
+                 fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s);
+             }
+         }
+     }
+ }
+
+ // Helper function to get a token from the combined, speculative sequence of inp and draft.
+ static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
+     return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
+ }
+
+ // If the sample size or percentage is below these thresholds, the draft is aborted early:
+ constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX]    = { 2,  2,  1,  1};
+ constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX]        = {66, 50, 50, 50};
+ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4,  3,  2,  2};
+ constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX]     = {75, 66, 66, 66};
+
+ // Helper function that tries to draft a token from only the static ngram cache:
+ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
+     llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+     if (part_static_it == nc_static.end()) {
+         return -1;
+     }
+     const llama_ngram_cache_part part_static = part_static_it->second;
+
+     int max_count_static  = 0;
+     int sum_count_static  = 0;
+     llama_token max_token = -1;
+
+     for (std::pair<llama_token, int> token_count_static : part_static) {
+         const llama_token token        = token_count_static.first;
+         const int32_t     count_static = token_count_static.second;
+
+         if (count_static > max_count_static) {
+             max_token        = token;
+             max_count_static = count_static;
+         }
+         sum_count_static += count_static;
+     }
+
+     if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) {
+         return -1;
+     }
+     if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) {
+         return -1;
+     }
+     return max_token;
+ }
+
+ // Try to draft a token from the primary cache (context/dynamic), validated with the static cache:
+ static llama_token try_draft(
+     llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+     const int * min_sample_size, const int * min_percent) {
+
+     llama_token drafted_token = -1;
+
+     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
+         const llama_ngram ngram_primary = ngrams_primary[i];
+
+         llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+         if (part_primary_it == nc_primary.end()) {
+             continue;
+         }
+         const llama_ngram_cache_part part_primary = part_primary_it->second;
+
+         int max_count_primary = 0;
+         int max_count_static  = 0;
+         int sum_count_primary = 0;
+         llama_token max_token = -1;
+
+         for (std::pair<llama_token, int> token_count_primary : part_primary) {
+             const llama_token token = token_count_primary.first;
+
+             llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+
+             const int32_t count_primary = token_count_primary.second;
+             const int32_t count_static  = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
+
+             if (count_primary*count_static > max_count_primary*max_count_static) {
+                 max_token         = token;
+                 max_count_primary = count_primary;
+                 max_count_static  = count_static;
+             }
+             sum_count_primary += count_primary;
+         }
+
+         if (sum_count_primary < min_sample_size[i]) {
+             continue;
+         }
+         if (100*max_count_primary < min_percent[i]*sum_count_primary) {
+             continue;
+         }
+         drafted_token = max_token;
+     }
+
+     return drafted_token;
+ }
+
+ void llama_ngram_cache_draft(
+     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+     llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+ ) {
+     GGML_ASSERT(draft.size() == 1);
+     const int inp_size = inp.size();
+
+     if (inp_size < LLAMA_NGRAM_STATIC) {
+         return;
+     }
+
+     while ((int) draft.size()-1 < n_draft) {
+         llama_token drafted_token = -1;
+
+         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
+         llama_ngram ngram_static;
+         for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
+             ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
+         }
+         llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+         llama_ngram_cache_part part_static;
+         if (part_static_it != nc_static.end()) {
+             part_static = part_static_it->second;
+         }
+
+         // cd = context + dynamic
+         std::vector<llama_ngram> ngrams_cd;
+         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
+             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
+             llama_ngram ngram_cd;
+             for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
+                 ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
+             }
+             ngrams_cd.push_back(ngram_cd);
+         }
+         if (drafted_token == -1) {
+             drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax);
+         }
+         if (drafted_token == -1) {
+             drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict);
+         }
+         if (drafted_token == -1) {
+             drafted_token = try_draft(nc_static, ngram_static);
+         }
+
+         if (drafted_token == -1) {
+             break;
+         }
+
+         LOG(" - draft candidate: token=%d\n", drafted_token);
+         draft.push_back(drafted_token);
+     }
+ }
+
+ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+     std::ofstream file_out(filename, std::ios::binary);
+     for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
+         const llama_ngram      ngram        = item.first;
+         llama_ngram_cache_part token_counts = item.second;
+         GGML_ASSERT(!token_counts.empty());
+         const int32_t ntokens = token_counts.size();
+         GGML_ASSERT(ntokens > 0);
+
+         file_out.write(reinterpret_cast<const char *>(&ngram),   sizeof(llama_ngram));
+         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
+         for (std::pair<llama_token, int32_t> item2 : token_counts) {
+             const llama_token token = item2.first;
+             const int32_t     count = item2.second;
+             GGML_ASSERT(count > 0);
+
+             file_out.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
+             file_out.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
+         }
+     }
+
+ }
+
+ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+     std::ifstream hashmap_file(filename, std::ios::binary);
+     if (!hashmap_file) {
+         throw std::ifstream::failure("Unable to open file " + filename);
+     }
+     llama_ngram_cache ngram_cache;
+
+     llama_ngram ngram;
+     int32_t     ntokens;
+     llama_token token;
+     int32_t     count;
+
+     char * ngramc   = reinterpret_cast<char*>(&ngram);
+     char * ntokensc = reinterpret_cast<char*>(&ntokens);
+     char * tokenc   = reinterpret_cast<char*>(&token);
+     char * countc   = reinterpret_cast<char*>(&count);
+     while (hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+         GGML_ASSERT(!hashmap_file.eof());
+         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
+         GGML_ASSERT(ntokens > 0);
+         llama_ngram_cache_part token_counts;
+
+         for (int i = 0; i < ntokens; ++i) {
+             GGML_ASSERT(!hashmap_file.eof());
+             GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
+             GGML_ASSERT(!hashmap_file.eof());
+             GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
+             GGML_ASSERT(count > 0);
+             token_counts.emplace(token, count);
+         }
+
+         ngram_cache.emplace(ngram, token_counts);
+     }
+     GGML_ASSERT(hashmap_file.eof());
+
+     return ngram_cache;
+ }
+
+ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
+     for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
+         const llama_ngram      ngram = ngram_part.first;
+         llama_ngram_cache_part part  = ngram_part.second;
+
+         llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+         if (part_merged_it == ngram_cache_target.end()) {
+             ngram_cache_target.emplace(ngram, part);
+             continue;
+         }
+
+         for (std::pair<llama_token, int32_t> token_count : part) {
+             const llama_token token = token_count.first;
+             const int32_t     count = token_count.second;
+             GGML_ASSERT(count > 0);
+
+             llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+             if (token_count_merged_it == part_merged_it->second.end()) {
+                 part_merged_it->second.emplace(token, count);
+                 continue;
+             }
+
+             token_count_merged_it->second += count;
+         }
+     }
+ }
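
Taken together, the functions above implement the n-gram lookup-decoding flow: index the prompt with llama_ngram_cache_update, then repeatedly extend a one-token draft with llama_ngram_cache_draft (the bundled examples/lookup/lookup.cpp in the file list above drives them the same way). The following is a minimal, hypothetical sketch of that flow, not code from this package: the token ids are made up, the empty dynamic/static caches are assumptions, and it presumes you compile and link against the bundled llama.cpp common sources.

#include "ngram-cache.h"

#include <cstdio>
#include <vector>

int main() {
    // Hypothetical token ids with heavy repetition, so the toy cache has
    // something to match; real ids would come from tokenizing a prompt.
    std::vector<llama_token> tokens = {10, 20, 10, 20, 10, 20, 10};

    llama_ngram_cache nc_context;
    llama_ngram_cache nc_dynamic; // assumed empty: no previous generations
    llama_ngram_cache nc_static;  // assumed empty: no corpus-level cache loaded

    // Index every 1- to 4-gram of the prompt; on the first call nnew is the whole input.
    llama_ngram_cache_update(nc_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                             tokens, (int) tokens.size(), /*print_progress =*/ false);

    // draft must start with the last sampled token; up to n_draft tokens are appended.
    std::vector<llama_token> draft = {tokens.back()};
    llama_ngram_cache_draft(tokens, draft, /*n_draft =*/ 4, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
                            nc_context, nc_dynamic, nc_static);

    for (size_t i = 1; i < draft.size(); ++i) {
        printf("draft[%zu] = %d\n", i, draft[i]);
    }
    return 0;
}

With this repetitive input the 4-gram {20, 10, 20, 10} clears the lax thresholds (sample size >= 1, share >= 50%), so the draft should fill up to n_draft tokens; a non-repetitive prompt would draft nothing and fall through to the break.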
package/src/llama.cpp/common/ngram-cache.h
@@ -0,0 +1,94 @@
+ #pragma once
+
+ #include "llama.h"
+
+ #include <unordered_map>
+ #include <string>
+ #include <vector>
+
+ #define LLAMA_NGRAM_MIN    1
+ #define LLAMA_NGRAM_MAX    4
+ #define LLAMA_NGRAM_STATIC 2
+
+ // Data structures to map n-grams to empirical token probabilities:
+
+ struct llama_ngram {
+     llama_token tokens[LLAMA_NGRAM_MAX];
+
+     llama_ngram() {
+         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+             tokens[i] = -1;
+         }
+     }
+
+     llama_ngram(const llama_token * input, const int ngram_size) {
+         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+             tokens[i] = i < ngram_size ? input[i] : -1;
+         }
+     }
+
+     bool operator==(const llama_ngram & other) const {
+         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+             if (tokens[i] != other.tokens[i]) {
+                 return false;
+             }
+         }
+         return true;
+     }
+ };
+
+ struct llama_ngram_hash_function {
+     size_t operator()(const llama_ngram & ngram) const {
+         size_t hash = 0;
+         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+             hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+         }
+         return hash;
+     }
+ };
+
+ // token -> number of times token has been seen
+ typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+
+ // n-gram -> empirical distribution of following tokens
+ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+
+
+ // Update an ngram cache with tokens.
+ // ngram_cache:         the cache to modify.
+ // ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
+ // inp_data:            the token sequence with which to update ngram_cache.
+ // nnew:                how many new tokens have been appended to inp_data since the last call to this function.
+ // print_progress:      whether to print progress to stderr.
+ //
+ // In order to get correct results, inp_data can ONLY BE APPENDED TO.
+ // Changes in the middle need a complete rebuild.
+ void llama_ngram_cache_update(
+     llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+
+ // Try to draft tokens from ngram caches.
+ // inp:                 the tokens generated so far.
+ // draft:               the token sequence to draft. Expected to initially contain the previously sampled token.
+ // n_draft:             maximum number of tokens to add to draft.
+ // ngram_min/ngram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
+ // nc_context:          ngram cache based on current context.
+ // nc_dynamic:          ngram cache based on previous user generations.
+ // nc_static:           ngram cache generated from a large text corpus, used for validation.
+ void llama_ngram_cache_draft(
+     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
+     llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+
+ // Save an ngram cache to a file.
+ // ngram_cache: the ngram cache to save.
+ // filename:    the path under which to save the ngram cache.
+ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+
+ // Load an ngram cache saved with llama_ngram_cache_save.
+ // filename: the path from which to load the ngram cache.
+ // returns:  an ngram cache containing the information saved to filename.
+ llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+
+ // Merge two ngram caches.
+ // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
+ // ngram_cache_add:    the ngram cache to add to ngram_cache_target.
+ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
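
The save/load/merge trio in this header exists so a dynamic cache can persist across runs (the bundled examples/lookup/lookup-create.cpp and lookup-merge.cpp in the file list use it for exactly that). Below is a minimal sketch of that round trip; the helper name, file path, and surrounding flow are hypothetical, not from the package.

#include "ngram-cache.h"

#include <fstream>
#include <string>

// Hypothetical helper: fold the n-grams learned from this session's context
// back into an on-disk dynamic cache shared across runs.
void persist_dynamic_cache(llama_ngram_cache & nc_context) {
    std::string path = "dynamic-cache.bin"; // assumed location, not defined by the package

    llama_ngram_cache nc_dynamic;
    try {
        nc_dynamic = llama_ngram_cache_load(path); // binary format written by _save
    } catch (const std::ifstream::failure &) {
        // First run: nothing on disk yet, so start from an empty cache
        // (llama_ngram_cache_load throws std::ifstream::failure on a missing file).
    }

    // Counts for (ngram, token) pairs present in both caches are summed, not overwritten.
    llama_ngram_cache_merge(nc_dynamic, nc_context);
    llama_ngram_cache_save(nc_dynamic, path);
}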