@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
package/src/llama.cpp/examples/passkey/passkey.cpp
@@ -0,0 +1,302 @@
+ #include "common.h"
+ #include "llama.h"
+
+ #include <cmath>
+ #include <cstdio>
+ #include <string>
+ #include <vector>
+
+ int main(int argc, char ** argv) {
+     gpt_params params;
+
+     if (argc == 1 || argv[1][0] == '-') {
+         printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
+         return 1 ;
+     }
+
+     int seed = -1;
+
+     int n_junk = 250; // number of times to repeat the junk text
+     int n_keep = 32;  // number of tokens in the prompt prefix
+     int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
+     int i_pos  = -1;  // position of the passkey in the junk text
+
+     if (argc >= 2) {
+         params.model = argv[1];
+     }
+
+     if (argc >= 3) {
+         n_junk = std::stoi(argv[2]);
+     }
+
+     if (argc >= 4) {
+         n_grp = std::stoi(argv[3]);
+     }
+
+     if (argc >= 5) {
+         i_pos = std::stoi(argv[4]);
+     }
+
+     if (argc >= 6) {
+         seed = std::stoi(argv[5]);
+     }
+
+     if (seed == -1) {
+         seed = time(NULL);
+     }
+
+     srand(seed);
+
+     if (i_pos == -1) {
+         i_pos = rand() % n_junk;
+     }
+
+     const std::string prompt_prefix = "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.";
+     const std::string prompt_suffix = " What is the pass key? The pass key is";
+
+     // generate junk text
+     params.prompt = prompt_prefix;
+
+     const int passkey = rand() % 50000 + 1;
+
+     for (int i = 0; i < n_junk; i++) {
+         if (i % n_junk == i_pos) {
+             params.prompt += " The pass key is " + std::to_string(passkey) + ". Remember it. " + std::to_string(passkey) + " is the pass key.";
+         }
+
+         params.prompt += " The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.";
+     }
+
+     params.prompt += prompt_suffix;
+
+     // init LLM
+
+     llama_backend_init();
+     llama_numa_init(params.numa);
+
+     // initialize the model
+
+     llama_model_params model_params = llama_model_default_params();
+
+     model_params.n_gpu_layers = 99; // offload all layers to the GPU
+
+     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+     if (model == NULL) {
+         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+         return 1;
+     }
+
+     // initialize the context
+
+     llama_context_params ctx_params = llama_context_default_params();
+
+     ctx_params.seed    = seed;
+     ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
+     ctx_params.n_batch = 512;
+     ctx_params.n_threads       = params.n_threads;
+     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
+
+     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+     if (ctx == NULL) {
+         fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+         return 1;
+     }
+
+     // tokenize the prompt
+     std::vector<llama_token> tokens_list;
+     tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+
+     // tokenize the prefix and use it as a sink
+     const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+
+     const int n_tokens_all = tokens_list.size();
+
+     // we leave a margin of 16 tokens for the generated text - it should contain just the passkey
+     const int n_predict = 16;
+
+     // total length of the sequences including the prompt
+     const int n_len = n_tokens_all + n_predict;
+
+     const int n_ctx       = llama_n_ctx(ctx) - n_keep;
+     const int n_kv_req    = llama_n_ctx(ctx);
+     const int n_batch     = ctx_params.n_batch;
+     const int n_batch_grp = ctx_params.n_batch/n_grp;
+
+     LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos);
+
+     // print the prompt token-by-token
+
+     LOG_TEE("\n");
+     LOG_TEE("prefix tokens: %d\n", n_tokens_prefix);
+     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
+     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
+
+     llama_batch batch = llama_batch_init(512, 0, 1);
+
+     int n_past = 0;
+
+     // fill the KV cache
+     for (int i = 0; i < n_ctx; i += n_batch) {
+         if (i > 0 && n_grp > 1) {
+             // if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
+             const int ib = i/n_batch - 1;
+             const int bd = n_batch_grp*(n_grp - 1);
+
+             llama_kv_cache_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+             llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+             llama_kv_cache_update  (ctx);
+
+             n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+         }
+
+         llama_batch_clear(batch);
+
+         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
+             llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+         }
+
+         if (i + n_batch >= n_tokens_all) {
+             batch.logits[batch.n_tokens - 1] = true;
+         }
+
+         if (llama_decode(ctx, batch) != 0) {
+             LOG_TEE("%s: llama_decode() failed\n", __func__);
+             return 1;
+         }
+
+         LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+
+         if (i + n_batch >= n_tokens_all) {
+             break;
+         }
+     }
+
+     for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
+         const int n_discard = n_batch;
+
+         LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
+
+         llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
+         llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
+         //llama_kv_cache_defrag (ctx);
+         llama_kv_cache_update (ctx);
+
+         n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+
+         llama_batch_clear(batch);
+
+         for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
+             llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+         }
+
+         if (i + n_batch >= n_tokens_all) {
+             batch.logits[batch.n_tokens - 1] = true;
+         }
+
+         if (llama_decode(ctx, batch) != 0) {
+             LOG_TEE("%s: llama_decode() failed\n", __func__);
+             return 1;
+         }
+
+         LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all));
+     }
+
+     {
+         const int n_discard = n_past - n_ctx + n_predict;
+
+         if (n_discard > 0) {
+             LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
+
+             llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
+             llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
+             //llama_kv_cache_defrag (ctx);
+             llama_kv_cache_update (ctx);
+
+             n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+         }
+     }
+
+     LOG_TEE("\n");
+     LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk);
+     LOG_TEE("\n");
+
+     // main loop
+
+     int n_cur    = n_tokens_all;
+     int n_decode = 0;
+
+     LOG_TEE("%s", prompt_suffix.c_str());
+     fflush(stdout);
+
+     const auto t_main_start = ggml_time_us();
+
+     while (n_cur <= n_len) {
+         // sample the next token
+         {
+             auto   n_vocab = llama_n_vocab(model);
+             auto * logits  = llama_get_logits_ith(ctx, batch.n_tokens - 1);
+
+             std::vector<llama_token_data> candidates;
+             candidates.reserve(n_vocab);
+
+             for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                 candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+             }
+
+             llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+             // sample the most likely token
+             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
+
+             // is it an end of generation?
+             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+                 LOG_TEE("\n");
+
+                 break;
+             }
+
+             LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+             fflush(stdout);
+
+             n_decode += 1;
+
+             // prepare the next batch
+             llama_batch_clear(batch);
+
+             // push this new token for next evaluation
+             llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
+         }
+
+         n_cur += 1;
+
+         // evaluate the current batch with the transformer model
+         if (llama_decode(ctx, batch)) {
+             fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+             return 1;
+         }
+     }
+
+     LOG_TEE("\n");
+
+     const auto t_main_end = ggml_time_us();
+
+     LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+     llama_print_timings(ctx);
+
+     fprintf(stderr, "\n");
+
+     llama_batch_free(batch);
+
+     llama_free(ctx);
+     llama_free_model(model);
+
+     llama_backend_free();
+
+     return 0;
+ }
package/src/llama.cpp/examples/perplexity/CMakeLists.txt
@@ -0,0 +1,5 @@
+ set(TARGET perplexity)
+ add_executable(${TARGET} perplexity.cpp)
+ install(TARGETS ${TARGET} RUNTIME)
+ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_compile_features(${TARGET} PRIVATE cxx_std_11)