@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/infill/infill.cpp
@@ -1,8 +1,9 @@
+ #include "arg.h"
  #include "common.h"
-
  #include "console.h"
+ #include "sampling.h"
+ #include "log.h"
  #include "llama.h"
- #include "grammar-parser.h"
 
  #include <cassert>
  #include <cinttypes>
@@ -34,6 +35,7 @@
 
  static llama_context ** g_ctx;
  static llama_model ** g_model;
+ static gpt_sampler ** g_smpl;
  static gpt_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
@@ -54,7 +56,7 @@ static void write_logfile(
 
  const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
  __func__, params.logdir.c_str());
  return;
  }
@@ -63,7 +65,7 @@ static void write_logfile(
  FILE * logfile = fopen(logfile_path.c_str(), "w");
 
  if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
  return;
  }
 
@@ -81,7 +83,7 @@ static void write_logfile(
  yaml_dump_string_multiline(logfile, "output", output.c_str());
  yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
 
- llama_dump_timing_info_yaml(logfile, ctx);
+ llama_perf_dump_yaml(logfile, ctx);
  fclose(logfile);
  }
 
@@ -92,9 +94,14 @@ static void sigint_handler(int signo) {
  is_interacting = true;
  } else {
  console::cleanup();
- printf("\n");
- llama_print_timings(*g_ctx);
+ LOG("\n");
+ gpt_perf_print(*g_ctx, *g_smpl);
  write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+ // make sure all logs are flushed
+ LOG("Interrupted by user\n");
+ gpt_log_pause(gpt_log_main());
+
  _exit(130);
  }
  }
@@ -103,106 +110,95 @@ static void sigint_handler(int signo) {
 
  int main(int argc, char ** argv) {
  gpt_params params;
- llama_sampling_params & sparams = params.sparams;
  g_params = &params;
 
- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
  return 1;
  }
 
- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("infill", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- #endif // LOG_DISABLE_LOGS
+ gpt_init();
+
+ auto & sparams = params.sparams;
 
  console::init(params.simple_io, params.use_color);
  atexit([]() { console::cleanup(); });
 
  if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");
 
  return 0;
  }
 
  if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");
 
  return 0;
  }
 
  if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__);
  params.n_ctx = 8;
  }
+
  if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
- printf("\n************\n");
- printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
- printf("************\n\n");
+ LOG_ERR("\n************\n");
+ LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
+ LOG_ERR("************\n\n");
 
  return 0;
  }
 
  if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
  }
 
  if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
+ LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
  }
 
- LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
- LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- LOG_TEE("%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
-
- LOG("%s: llama backend init\n", __func__);
+ LOG_INF("%s: llama backend init\n", __func__);
  llama_backend_init();
  llama_numa_init(params.numa);
 
- llama_model * model;
- llama_context * ctx;
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
+ gpt_sampler * smpl = nullptr;
 
  g_model = &model;
  g_ctx = &ctx;
+ g_smpl = &smpl;
 
  // load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;
 
  if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }
 
  const int n_ctx_train = llama_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);
+ LOG_DBG("n_ctx: %d\n", n_ctx);
 
  if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
  }
 
  // print system information
  {
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
  }
- const bool add_bos = llama_should_add_bos_token(model);
- GGML_ASSERT(llama_add_eos_token(model) != 1);
- LOG("add_bos: %d\n", add_bos);
+ const bool add_bos = llama_add_bos_token(model);
+ GGML_ASSERT(!llama_add_eos_token(model));
 
  std::vector<llama_token> embd_inp;
  std::vector<llama_token> embd_end;
@@ -227,18 +223,19 @@ int main(int argc, char ** argv) {
  embd_inp.push_back(middle_token);
  }
 
- LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
- LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("add_bos: %d\n", add_bos);
+ LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str());
+ LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
 
  // Should not run without any tokens
  if (embd_inp.empty()) {
  embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  }
 
  if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
  return 1;
  }
 
@@ -247,9 +244,8 @@ int main(int argc, char ** argv) {
  params.n_keep = (int)embd_inp.size();
  }
 
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
+ LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str());
+ LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str());
 
  // enable interactive mode if interactive start is specified
  if (params.interactive_first) {
@@ -257,21 +253,21 @@ int main(int argc, char ** argv) {
  }
 
  if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("\n");
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
  for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
  }
 
  if (params.n_keep > 0) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
  }
- LOG_TEE("'\n");
+ LOG_CNT("'\n");
  }
- LOG_TEE("\n");
+ LOG_INF("\n");
  }
 
  if (params.interactive) {
@@ -288,30 +284,30 @@ int main(int argc, char ** argv) {
  SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
  #endif
 
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);
 
  if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
  }
 
  if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
  }
 
  if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
  }
  }
- LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
- LOG_TEE("\n\n");
-
- LOG_TEE("\n##### Infill mode #####\n\n");
- if (params.infill) {
- printf("\n************\n");
- printf("no need to specify '--infill', always running infill\n");
- printf("************\n\n");
- }
+ smpl = gpt_sampler_init(model, sparams);
+
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+ LOG_INF("\n");
+ LOG_INF("\n##### Infill mode #####\n\n");
  if (params.interactive) {
  const char *control_message;
  if (params.multiline_input) {
@@ -322,11 +318,11 @@ int main(int argc, char ** argv) {
  " - To return control without starting a new line, end your input with '/'.\n"
  " - If you want to submit another line, end your input with '\\'.\n";
  }
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG_INF("== Running in interactive mode. ==\n");
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
- LOG_TEE( "%s\n", control_message);
+ LOG_INF( "%s\n", control_message);
 
  is_interacting = params.interactive_first;
  }
@@ -346,8 +342,6 @@ int main(int argc, char ** argv) {
 
  std::vector<llama_token> embd;
 
- struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-
  while (n_remain != 0 || params.interactive) {
  // predict
  if (!embd.empty()) {
@@ -361,9 +355,8 @@ int main(int argc, char ** argv) {
  embd.resize(max_embd_size);
 
  console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
  console::set_display(console::reset);
- fflush(stdout);
  }
 
  // infinite text generation via context swapping
@@ -372,14 +365,14 @@ int main(int argc, char ** argv) {
  // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
  if (n_past + (int) embd.size() > n_ctx) {
  if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
  break;
  }
 
  const int n_left = n_past - params.n_keep - 1;
  const int n_discard = n_left/2;
 
- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
  n_past, n_left, n_ctx, params.n_keep, n_discard);
 
  llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
@@ -387,9 +380,9 @@ int main(int argc, char ** argv) {
 
  n_past -= n_discard;
 
- LOG("after swap: n_past = %d\n", n_past);
+ LOG_DBG("after swap: n_past = %d\n", n_past);
 
- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());
 
  }
 
@@ -401,16 +394,16 @@ int main(int argc, char ** argv) {
  n_eval = params.n_batch;
  }
 
- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
 
  if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }
 
  n_past += n_eval;
 
- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
  }
 
  }
@@ -418,11 +411,11 @@ int main(int argc, char ** argv) {
  embd.clear();
 
  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
+ const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
 
- llama_sampling_accept(ctx_sampling, ctx, id, true);
+ gpt_sampler_accept(smpl, id, true);
 
- LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
  embd.push_back(id);
 
@@ -432,16 +425,16 @@ int main(int argc, char ** argv) {
  // decrement remaining sampling budget
  --n_remain;
 
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
  // some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
  while ((int) embd_inp.size() > n_consumed) {
  embd.push_back(embd_inp[n_consumed]);
 
  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
+ gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
 
  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
@@ -454,7 +447,7 @@ int main(int argc, char ** argv) {
  if (input_echo) {
  for (auto id : embd) {
  const std::string token_str = llama_token_to_piece(ctx, id);
- printf("%s", token_str.c_str());
+ LOG("%s", token_str.c_str());
 
  if (embd.size() > 1) {
  input_tokens.push_back(id);
@@ -463,7 +456,6 @@ int main(int argc, char ** argv) {
  output_ss << token_str;
  }
  }
- fflush(stdout);
  }
  // reset color to default if we there is no pending user input
  if (input_echo && (int) embd_inp.size() == n_consumed) {
@@ -473,13 +465,12 @@ int main(int argc, char ** argv) {
  // if not currently processing queued inputs;
  if ((int) embd_inp.size() <= n_consumed) {
  // deal with eot token in infill mode
- if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
+ if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
  if (is_interacting && !params.interactive_first) {
  // print an eot token
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
  }
- fflush(stdout);
- printf("\n");
+ LOG("\n");
  console::set_display(console::user_input);
  std::string buffer;
  std::string line;
@@ -535,35 +526,33 @@ int main(int argc, char ** argv) {
  n_remain = params.n_predict;
  n_past = 0;
  n_consumed = 0;
- // LOG_TEE("took new input\n");
  is_interacting = false;
  }
  // deal with end of generation tokens in interactive mode
- else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
- LOG("found EOS token\n");
+ else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+ LOG_DBG("found EOS token\n");
 
  if (params.interactive) {
 
  is_interacting = true;
- printf("\n");
+ LOG("\n");
  console::set_display(console::user_input);
- fflush(stdout);
  }
  }
 
  if (n_past > 0 && is_interacting && !params.interactive) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");
 
  if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
  embd_inp.push_back(llama_token_bos(model));
  }
 
  std::string buffer;
  if (!params.input_prefix.empty()) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
  buffer += params.input_prefix;
- printf("%s", buffer.c_str());
+ LOG("%s", buffer.c_str());
  }
 
  std::string line;
@@ -581,17 +570,17 @@ int main(int argc, char ** argv) {
  if (buffer.length() > 1) {
  // append input suffix if any
  if (!params.input_suffix.empty()) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
  buffer += params.input_suffix;
- printf("%s", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
  }
 
- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());
 
  const size_t original_size = embd_inp.size();
 
  const auto line_inp = ::llama_tokenize(ctx, buffer, false);
- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
  embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
@@ -602,9 +591,9 @@ int main(int argc, char ** argv) {
  }
 
  n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
  }
 
  input_echo = false; // do not echo this again
@@ -612,7 +601,7 @@ int main(int argc, char ** argv) {
 
  if (n_past > 0) {
  if (is_interacting) {
- llama_sampling_reset(ctx_sampling);
+ gpt_sampler_reset(smpl);
  }
  is_interacting = false;
  }
@@ -631,22 +620,18 @@ int main(int argc, char ** argv) {
  }
  }
  if (!params.interactive && n_remain <= 0) {
- printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
- fflush(stdout);
+ LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
  }
 
- llama_print_timings(ctx);
+ LOG("\n");
+ gpt_perf_print(ctx, smpl);
  write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
  llama_free(ctx);
  llama_free_model(model);
 
- llama_sampling_free(ctx_sampling);
+ gpt_sampler_free(smpl);
  llama_backend_free();
 
- #ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
- #endif // LOG_DISABLE_LOGS
-
  return 0;
  }
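
Note on the API changes visible in the infill.cpp hunks above: the old llama_sampling_* context, grammar-parser.h, and the LOG_TEE/LOG_DISABLE_LOGS macros are replaced by the common-library gpt_sampler and the LOG/LOG_INF/LOG_WRN/LOG_ERR/LOG_DBG loggers, and model/context initialization now comes back in a llama_init_result. The sketch below is a hypothetical, condensed generation loop written only from calls that appear verbatim in this diff (gpt_params_parse, gpt_init, llama_init_from_gpt_params, gpt_sampler_init/sample/accept/free, gpt_perf_print); the LLAMA_EXAMPLE_COMMON enum value, the simplified batching, and the error handling are assumptions for illustration and are not part of this package.

// Hypothetical condensed example (not shipped in this package): the 0.3.2-era
// sampling and logging call pattern from infill.cpp, reduced to a plain
// prompt -> generate loop.
#include "arg.h"
#include "common.h"
#include "sampling.h"
#include "log.h"
#include "llama.h"

#include <algorithm>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { // enum value assumed
        return 1;
    }
    gpt_init(); // replaces the old LOG_DISABLE_LOGS / log_set_target() boilerplate

    llama_backend_init();
    llama_numa_init(params.numa);

    // model and context now come back together in a llama_init_result
    llama_init_result llama_init = llama_init_from_gpt_params(params);
    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }

    // gpt_sampler replaces the old llama_sampling_context
    gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
    LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
    LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());

    // tokenize and decode the prompt, feeding it to the sampler without grammar
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    int n_past = 0;
    for (int i = 0; i < (int) embd_inp.size(); i += params.n_batch) {
        const int n_eval = std::min((int) embd_inp.size() - i, (int) params.n_batch);
        if (llama_decode(ctx, llama_batch_get_one(&embd_inp[i], n_eval, n_past, 0))) {
            LOG_ERR("%s: failed to eval\n", __func__);
            return 1;
        }
        n_past += n_eval;
        for (int j = 0; j < n_eval; j++) {
            gpt_sampler_accept(smpl, embd_inp[i + j], /* accept_grammar */ false);
        }
    }

    // generation loop (assumes a positive --n-predict)
    for (int i = 0; i < params.n_predict; i++) {
        llama_token id = gpt_sampler_sample(smpl, ctx, -1);      // was llama_sampling_sample(ctx_sampling, ctx, nullptr)
        gpt_sampler_accept(smpl, id, /* accept_grammar */ true); // was llama_sampling_accept(..., true)
        if (llama_token_is_eog(model, id)) {
            break;
        }
        LOG("%s", llama_token_to_piece(ctx, id).c_str());
        if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past++, 0))) {
            LOG_ERR("%s: failed to eval\n", __func__);
            return 1;
        }
    }

    LOG("\n");
    gpt_perf_print(ctx, smpl);   // was llama_print_timings(ctx)

    gpt_sampler_free(smpl);      // was llama_sampling_free(ctx_sampling)
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}

If it is built at all, it would have to be compiled against the bundled sources under package/src/llama.cpp in the same way the upstream examples are; that setup is outside the scope of this diff.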