@fugood/llama.node 0.3.1 → 0.3.2

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (187)
  1. package/CMakeLists.txt +0 -9
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/lookup/lookup.cpp
@@ -1,44 +1,40 @@
+ #include "arg.h"
  #include "ggml.h"
- #include "llama.h"
  #include "common.h"
  #include "ngram-cache.h"
+ #include "sampling.h"
+ #include "log.h"
+ #include "llama.h"

- #include <cmath>
  #include <cstdint>
  #include <cstdio>
  #include <fstream>
  #include <string>
  #include <vector>
- #include <unordered_map>

  int main(int argc, char ** argv){
  gpt_params params;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
  return 1;
  }

+ gpt_init();
+
  // max. number of additional tokens to draft if match is found
  const int n_draft = params.n_draft;

  const bool dump_kv_cache = params.dump_kv_cache;

- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("lookup", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
-
  // init llama.cpp
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model = NULL;
- llama_context * ctx = NULL;
-
  // load the model
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ llama_model * model = llama_init.model;
+ llama_context * ctx = llama_init.context;

  // tokenize the prompt
  std::vector<llama_token> inp;
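Taken together, this first hunk shows the initialization flow that this release adopts from the updated llama.cpp sources: argument parsing moves to the new common/arg.h entry point and takes a per-example id, the old LOG_DISABLE_LOGS setup block is replaced by a single gpt_init() call, and llama_init_from_gpt_params() now returns a llama_init_result struct instead of filling a model/context pair via std::tie. A minimal sketch of that flow, using only names that appear in this diff (the null-check and its error message are illustrative additions):

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    int main(int argc, char ** argv) {
        gpt_params params;

        // the parser now needs to know which example is calling it
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
            return 1;
        }

        gpt_init(); // replaces the old log_set_target()/LOG_TEE() preamble

        llama_backend_init();
        llama_numa_init(params.numa);

        // model and context now come back bundled in a single struct
        llama_init_result llama_init = llama_init_from_gpt_params(params);

        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;

        if (model == nullptr || ctx == nullptr) {
            LOG_ERR("%s: failed to initialize the model\n", __func__); // illustrative check
            return 1;
        }

        // ... prompt handling, drafting, sampling ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();

        return 0;
    }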
@@ -59,7 +55,7 @@ int main(int argc, char ** argv){
  try {
  ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
  } catch (std::ifstream::failure const &) {
- fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
+ LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
  exit(1);
  }
  }
@@ -77,14 +73,14 @@ int main(int argc, char ** argv){
  const int max_tokens_list_size = max_context_size - 4;

  if ((int) inp.size() > max_tokens_list_size) {
- fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+ LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
  return 1;
  }

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  for (auto id : inp) {
- fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+ LOG("%s", llama_token_to_piece(ctx, id).c_str());
  }

  fflush(stderr);
@@ -106,7 +102,7 @@ int main(int argc, char ** argv){

  bool has_eos = false;

- struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+ struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

  std::vector<llama_token> draft;

@@ -125,19 +121,19 @@ int main(int argc, char ** argv){
  }

  // print current draft sequence
- LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
+ LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());

  int i_dft = 0;
  while (true) {
  // sample from the target model
- llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
+ llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);

- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ gpt_sampler_accept(smpl, id, true);

  const std::string token_str = llama_token_to_piece(ctx, id);

  if (!params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  }

  if (llama_token_is_eog(model, id)) {
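The hunks above also document the sampling rework: struct llama_sampling_context and the llama_sampling_* calls are gone, replaced by a gpt_sampler that is created from the model and no longer needs the context when accepting a token. A condensed, hypothetical helper showing that lifecycle with the names used in this diff (the llama_decode() feedback step is omitted for brevity):

    #include "common.h"
    #include "sampling.h"
    #include "log.h"
    #include "llama.h"

    // hypothetical helper: stream up to n_predict tokens with the reworked sampler API
    static void generate_n(llama_model * model, llama_context * ctx,
                           const gpt_params & params, int n_predict) {
        struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

        for (int i = 0; i < n_predict; ++i) {
            // sample from the most recent logits (index -1), then record the choice;
            // the bool controls whether the grammar state is advanced as well
            const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
            gpt_sampler_accept(smpl, id, true);

            if (llama_token_is_eog(model, id)) {
                break;
            }

            LOG("%s", llama_token_to_piece(ctx, id).c_str());
            // feeding `id` back through llama_decode() is omitted here
        }

        gpt_sampler_free(smpl);
    }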
@@ -148,7 +144,7 @@ int main(int argc, char ** argv){

  // check if the target token matches the draft
  if (i_dft < (int) draft.size() && id == draft[i_dft]) {
- LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
+ LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
  ++n_accept;
  ++n_past;
  ++i_dft;
@@ -162,19 +158,19 @@ int main(int argc, char ** argv){

  if (params.use_color) {
  // color accepted draft token
- printf("\033[34m%s\033[0m", token_str.c_str());
+ LOG("\033[34m%s\033[0m", token_str.c_str());
  fflush(stdout);
  }
  continue;
  }

  if (params.use_color) {
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
  }
  fflush(stdout);


- LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
+ LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());

  draft.clear();
  draft.push_back(id);
@@ -225,25 +221,26 @@ int main(int argc, char ** argv){
  llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
  llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);

- LOG_TEE("\n\n");
+ LOG("\n\n");

- LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
- LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
+ LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
+ LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

- LOG_TEE("\n");
- LOG_TEE("n_draft = %d\n", n_draft);
- LOG_TEE("n_predict = %d\n", n_predict);
- LOG_TEE("n_drafted = %d\n", n_drafted);
- LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
- LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
+ LOG_INF("\n");
+ LOG_INF("n_draft = %d\n", n_draft);
+ LOG_INF("n_predict = %d\n", n_predict);
+ LOG_INF("n_drafted = %d\n", n_drafted);
+ LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3);
+ LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n",
  t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us));
- LOG_TEE("n_accept = %d\n", n_accept);
- LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+ LOG_INF("n_accept = %d\n", n_accept);
+ LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
+
+ LOG_INF("\ntarget:\n\n");
+ gpt_perf_print(ctx, smpl);

- LOG_TEE("\ntarget:\n");
- llama_print_timings(ctx);
+ gpt_sampler_free(smpl);

- llama_sampling_free(ctx_sampling);
  llama_batch_free(batch_tgt);

  llama_free(ctx);
@@ -251,7 +248,7 @@ int main(int argc, char ** argv){

  llama_backend_free();

- fprintf(stderr, "\n\n");
+ LOG("\n\n");

  return 0;
  }
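The tail of the diff illustrates the logging and timing migration that runs through every example in this update: raw fprintf(stderr, ...) calls and the old LOG_TEE macro are replaced by the leveled LOG / LOG_INF / LOG_ERR / LOG_DBG macros from the rewritten common/log.h, and llama_print_timings(ctx) gives way to gpt_perf_print(ctx, smpl), which also reports sampler statistics. A small, hypothetical reporting helper in that style (the counters are assumed to be tracked by the caller):

    #include "common.h"
    #include "sampling.h"
    #include "log.h"
    #include "llama.h"

    // hypothetical summary helper using the reworked common/log.h macros
    static void print_summary(llama_context * ctx, gpt_sampler * smpl,
                              int n_drafted, int n_accept) {
        LOG("\n\n");                                   // plain output, no level prefix
        LOG_INF("n_drafted = %d\n", n_drafted);        // info level
        LOG_INF("n_accept  = %d\n", n_accept);

        if (n_drafted > 0) {
            LOG_INF("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
        } else {
            LOG_ERR("%s: nothing was drafted\n", __func__); // error level, replaces fprintf(stderr, ...)
        }

        LOG_DBG("raw counters: %d/%d\n", n_accept, n_drafted); // shown only at higher verbosity

        LOG_INF("\ntarget:\n\n");
        gpt_perf_print(ctx, smpl);                     // replaces llama_print_timings(ctx)
    }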