@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
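The largest single change in this release is the vendored llama.cpp sync. The per-file diff reproduced below (it appears to be `package/src/llama.cpp/examples/main/main.cpp`) replaces the old `llama_sampling_*` context and `LOG`/`LOG_TEE` macros with the new `gpt_sampler_*` API and the `LOG_INF`/`LOG_WRN`/`LOG_ERR`/`LOG_DBG` helpers, drops the classifier-free guidance path, and wires up the new ggml threadpool. As a reading aid, here is a minimal sketch of the new flow condensed from the hunks that follow; it is not code shipped in this package, prompt batching, session caching and interactive mode are omitted, and the signatures are only the ones visible in this diff (later llama.cpp revisions may differ).

    // Sketch only: condensed from the main.cpp hunks below (llama.cpp as vendored in 0.3.2).
    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "sampling.h"
    #include "llama.h"

    static void print_usage(int, char ** argv) {
        LOG("\nexample usage:\n");
        LOG("\n  %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
            return 1;
        }

        gpt_init();              // replaces the old LOG_DISABLE_LOGS / log_set_target setup
        llama_backend_init();

        // llama_init_from_gpt_params now returns a struct instead of a (model, ctx) tuple
        llama_init_result llama_init = llama_init_from_gpt_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;
        if (model == NULL) {
            LOG_ERR("%s: unable to load model\n", __func__);
            return 1;
        }

        // gpt_sampler replaces the old llama_sampling_context
        gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);

        // evaluate the prompt (assumes it fits in a single batch)
        std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
        int n_past = (int) embd_inp.size();
        if (llama_decode(ctx, llama_batch_get_one(embd_inp.data(), n_past, 0, 0))) {
            LOG_ERR("%s: failed to eval\n", __func__);
            return 1;
        }

        // generation loop: sample, accept, print, feed back
        for (int i = 0; i < params.n_predict; ++i) {
            llama_token id = gpt_sampler_sample(smpl, ctx, -1);
            gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
            if (llama_token_is_eog(model, id)) {
                break;
            }
            LOG("%s", llama_token_to_piece(ctx, id, params.special).c_str());
            if (llama_decode(ctx, llama_batch_get_one(&id, 1, n_past++, 0))) {
                LOG_ERR("%s: failed to eval\n", __func__);
                return 1;
            }
        }

        gpt_perf_print(ctx, smpl);   // replaces llama_print_timings
        gpt_sampler_free(smpl);
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }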
@@ -1,11 +1,11 @@
+ #include "arg.h"
  #include "common.h"
-
  #include "console.h"
+ #include "log.h"
+ #include "sampling.h"
  #include "llama.h"

  #include <cassert>
- #include <cinttypes>
- #include <cmath>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
@@ -33,6 +33,7 @@

  static llama_context ** g_ctx;
  static llama_model ** g_model;
+ static gpt_sampler ** g_smpl;
  static gpt_params * g_params;
  static std::vector<llama_token> * g_input_tokens;
  static std::ostringstream * g_output_ss;
@@ -40,6 +41,15 @@ static std::vector<llama_token> * g_output_tokens;
  static bool is_interacting = false;
  static bool need_insert_eot = false;

+ static void print_usage(int argc, char ** argv) {
+ (void) argc;
+
+ LOG("\nexample usage:\n");
+ LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+ LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+ LOG("\n");
+ }
+
  static bool file_exists(const std::string & path) {
  std::ifstream f(path.c_str());
  return f.good();
@@ -65,8 +75,7 @@ static void write_logfile(

  const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
- __func__, params.logdir.c_str());
+ LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
  return;
  }

@@ -74,7 +83,7 @@ static void write_logfile(
  FILE * logfile = fopen(logfile_path.c_str(), "w");

  if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
  return;
  }

@@ -92,7 +101,7 @@
  yaml_dump_string_multiline(logfile, "output", output.c_str());
  yaml_dump_vector_int(logfile, "output_tokens", output_tokens);

- llama_dump_timing_info_yaml(logfile, ctx);
+ llama_perf_dump_yaml(logfile, ctx);
  fclose(logfile);
  }

@@ -104,50 +113,38 @@ static void sigint_handler(int signo) {
  need_insert_eot = true;
  } else {
  console::cleanup();
- printf("\n");
- llama_print_timings(*g_ctx);
+ LOG("\n");
+ gpt_perf_print(*g_ctx, *g_smpl);
  write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
+
+ // make sure all logs are flushed
+ LOG("Interrupted by user\n");
+ gpt_log_pause(gpt_log_main());
+
  _exit(130);
  }
  }
  }
  #endif

- static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- LOG_TEE("%s", text);
- }
-
- static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
+ static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
  llama_chat_msg new_msg{role, content};
- auto formatted = llama_chat_format_single(
- model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+ auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
  chat_msgs.push_back({role, content});
- LOG("formatted: %s\n", formatted.c_str());
+ LOG_DBG("formatted: '%s'\n", formatted.c_str());
  return formatted;
  }

  int main(int argc, char ** argv) {
  gpt_params params;
  g_params = &params;
-
- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
  return 1;
  }

- llama_sampling_params & sparams = params.sparams;
+ gpt_init();

- #ifndef LOG_DISABLE_LOGS
- log_set_target(log_filename_generator("main", "log"));
- LOG_TEE("Log start\n");
- log_dump_cmdline(argc, argv);
- llama_log_set(llama_log_callback_logTee, nullptr);
- #endif // LOG_DISABLE_LOGS
-
- // TODO: Dump params ?
- //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
+ auto & sparams = params.sparams;

  // save choice to use color for later
  // (note for later: this is a slightly awkward choice)
@@ -155,120 +152,141 @@ int main(int argc, char ** argv) {
  atexit([]() { console::cleanup(); });

  if (params.logits_all) {
- printf("\n************\n");
- printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.embedding) {
- printf("\n************\n");
- printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
- printf("************\n\n");
+ LOG_ERR("************\n");
+ LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+ LOG_ERR("************\n\n");

  return 0;
  }

  if (params.n_ctx != 0 && params.n_ctx < 8) {
- LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+ LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
  params.n_ctx = 8;
  }

  if (params.rope_freq_base != 0.0) {
- LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
+ LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
  }

  if (params.rope_freq_scale != 0.0) {
- LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
- }
-
- LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
- LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
+ LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
  }

- LOG_TEE("%s: seed = %u\n", __func__, params.seed);
+ LOG_INF("%s: llama backend init\n", __func__);

- std::mt19937 rng(params.seed);
-
- LOG("%s: llama backend init\n", __func__);
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model;
- llama_context * ctx;
- llama_context * ctx_guidance = NULL;
+ llama_model * model = nullptr;
+ llama_context * ctx = nullptr;
+ gpt_sampler * smpl = nullptr;
+
  std::vector<llama_chat_msg> chat_msgs;
+
  g_model = &model;
  g_ctx = &ctx;
+ g_smpl = &smpl;

  // load the model and apply lora adapter, if any
- LOG("%s: load the model and apply lora adapter, if any\n", __func__);
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (sparams.cfg_scale > 1.f) {
- struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
- ctx_guidance = llama_new_context_with_model(model, lparams);
- }
+ LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;

  if (model == NULL) {
- LOG_TEE("%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: error: unable to load model\n", __func__);
  return 1;
  }

+ LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
+
+ struct ggml_threadpool_params tpp_batch =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+ struct ggml_threadpool_params tpp =
+ ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+ set_process_priority(params.cpuparams.priority);
+
+ struct ggml_threadpool * threadpool_batch = NULL;
+ if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+ threadpool_batch = ggml_threadpool_new(&tpp_batch);
+ if (!threadpool_batch) {
+ LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+ return 1;
+ }
+
+ // Start the non-batch threadpool in the paused state
+ tpp.paused = true;
+ }
+
+ struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+ if (!threadpool) {
+ LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+ return 1;
+ }
+
+ llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+
  const int n_ctx_train = llama_n_ctx_train(model);
  const int n_ctx = llama_n_ctx(ctx);
- LOG("n_ctx: %d\n", n_ctx);

  if (n_ctx > n_ctx_train) {
- LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
- __func__, n_ctx_train, n_ctx);
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
  }

  // print chat template example in conversation mode
  if (params.conversation) {
  if (params.enable_chat_template) {
- LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+ LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
  } else {
- LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
+ LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
  }
  }

  // print system information
  {
- LOG_TEE("\n");
- LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
  }

  std::string path_session = params.path_prompt_cache;
  std::vector<llama_token> session_tokens;

  if (!path_session.empty()) {
- LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+ LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
  if (!file_exists(path_session)) {
- LOG_TEE("%s: session file does not exist, will create.\n", __func__);
+ LOG_INF("%s: session file does not exist, will create.\n", __func__);
  } else if (file_is_empty(path_session)) {
- LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
+ LOG_INF("%s: The session file is empty. A new session will be initialized.\n", __func__);
  } else {
  // The file exists and is not empty
  session_tokens.resize(n_ctx);
  size_t n_token_count_out = 0;
  if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
- LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+ LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str());
  return 1;
  }
  session_tokens.resize(n_token_count_out);
- LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
+ LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
  }
  }

- const bool add_bos = llama_should_add_bos_token(model);
+ const bool add_bos = llama_add_bos_token(model);
  if (!llama_model_has_encoder(model)) {
- GGML_ASSERT(llama_add_eos_token(model) != 1);
+ GGML_ASSERT(!llama_add_eos_token(model));
  }
- LOG("add_bos: %d\n", add_bos);
+
+ LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos);

  std::vector<llama_token> embd_inp;

@@ -277,49 +295,31 @@ int main(int argc, char ** argv) {
  ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
  : params.prompt;
  if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
- LOG("tokenize the prompt\n");
+ LOG_DBG("tokenize the prompt\n");
  embd_inp = ::llama_tokenize(ctx, prompt, true, true);
  } else {
- LOG("use session tokens\n");
+ LOG_DBG("use session tokens\n");
  embd_inp = session_tokens;
  }

- LOG("prompt: \"%s\"\n", log_tostr(prompt));
- LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_DBG("prompt: \"%s\"\n", prompt.c_str());
+ LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str());
  }

  // Should not run without any tokens
  if (embd_inp.empty()) {
  if (add_bos) {
  embd_inp.push_back(llama_token_bos(model));
- LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
+ LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str());
  } else {
- LOG_TEE("error: input is empty\n");
+ LOG_ERR("input is empty\n");
  return -1;
  }
  }

  // Tokenize negative prompt
- std::vector<llama_token> guidance_inp;
- int guidance_offset = 0;
- int original_prompt_len = 0;
- if (ctx_guidance) {
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
- LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
- LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
- original_prompt_len = original_inp.size();
- guidance_offset = (int)guidance_inp.size() - original_prompt_len;
- LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
- LOG("guidance_offset: %s", log_tostr(guidance_offset));
- }
-
  if ((int) embd_inp.size() > n_ctx - 4) {
- LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+ LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
  return 1;
  }

@@ -333,29 +333,28 @@ int main(int argc, char ** argv) {
  n_matching_session_tokens++;
  }
  if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
- LOG_TEE("%s: using full prompt from session file\n", __func__);
+ LOG_INF("%s: using full prompt from session file\n", __func__);
  } else if (n_matching_session_tokens >= embd_inp.size()) {
- LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
+ LOG_INF("%s: session file has exact match for prompt!\n", __func__);
  } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
- LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
  } else {
- LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
- __func__, n_matching_session_tokens, embd_inp.size());
+ LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n",
+ __func__, n_matching_session_tokens, embd_inp.size());
  }

  // remove any "future" tokens that we might have inherited from the previous session
  llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
  }

- LOGLN(
- "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
- log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
+ LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
+ embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size());

  // if we will use the cache for the full prompt without reaching the end of the cache, force
  // reevaluation of the last token to recalculate the cached logits
  if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
- LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
+ LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1);

  session_tokens.resize(embd_inp.size() - 1);
  }
@@ -377,30 +376,20 @@ int main(int argc, char ** argv) {
  }

  if (params.verbose_prompt) {
- LOG_TEE("\n");
- LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
  for (int i = 0; i < (int) embd_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
- }
-
- if (ctx_guidance) {
- LOG_TEE("\n");
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
- LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
- for (int i = 0; i < (int) guidance_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
- }
+ LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
  }

  if (params.n_keep > add_bos) {
- LOG_TEE("%s: static prompt based on n_keep: '", __func__);
+ LOG_INF("%s: static prompt based on n_keep: '", __func__);
  for (int i = 0; i < params.n_keep; i++) {
- LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+ LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
  }
- LOG_TEE("'\n");
+ LOG_CNT("'\n");
  }
- LOG_TEE("\n");
+ LOG_INF("\n");
  }

  // ctrl+C handling
@@ -420,47 +409,56 @@ int main(int argc, char ** argv) {
  }

  if (params.interactive) {
- LOG_TEE("%s: interactive mode on.\n", __func__);
+ LOG_INF("%s: interactive mode on.\n", __func__);

  if (!params.antiprompt.empty()) {
  for (const auto & antiprompt : params.antiprompt) {
- LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
+ LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
  if (params.verbose_prompt) {
  auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
  }

  if (params.input_prefix_bos) {
- LOG_TEE("Input prefix with BOS\n");
+ LOG_INF("Input prefix with BOS\n");
  }

  if (!params.input_prefix.empty()) {
- LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
  if (params.verbose_prompt) {
  auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }

  if (!params.input_suffix.empty()) {
- LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
  if (params.verbose_prompt) {
  auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
  for (int i = 0; i < (int) tmp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+ LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
  }
  }
  }
  }
- LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
- LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
- LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+
+ smpl = gpt_sampler_init(model, sparams);
+ if (!smpl) {
+ LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
+ return 1;
+ }
+
+ LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+ LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
+ LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+
+ LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

  // group-attention state
  // number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@@ -474,9 +472,9 @@ int main(int argc, char ** argv) {
  GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
  //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
  //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
- LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
+ LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
  }
- LOG_TEE("\n\n");
+ LOG_INF("\n");

  if (params.interactive) {
  const char * control_message;
@@ -488,11 +486,11 @@ int main(int argc, char ** argv) {
  " - To return control without starting a new line, end your input with '/'.\n"
  " - If you want to submit another line, end your input with '\\'.\n";
  }
- LOG_TEE("== Running in interactive mode. ==\n");
+ LOG_INF("== Running in interactive mode. ==\n");
  #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
- LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
+ LOG_INF( " - Press Ctrl+C to interject at any time.\n");
  #endif
- LOG_TEE( "%s\n", control_message);
+ LOG_INF( "%s\n", control_message);

  is_interacting = params.interactive_first;
  }
@@ -506,7 +504,6 @@ int main(int argc, char ** argv) {
  int n_remain = params.n_predict;
  int n_consumed = 0;
  int n_session_consumed = 0;
- int n_past_guidance = 0;

  std::vector<int> input_tokens; g_input_tokens = &input_tokens;
  std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@@ -518,7 +515,6 @@ int main(int argc, char ** argv) {
  display = params.display_prompt;

  std::vector<llama_token> embd;
- std::vector<llama_token> embd_guidance;

  // tokenized antiprompts
  std::vector<std::vector<llama_token>> antiprompt_ids;
@@ -528,18 +524,12 @@ int main(int argc, char ** argv) {
  antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
  }

- struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
- if (!ctx_sampling) {
- fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
- exit(1);
- }
-
  if (llama_model_has_encoder(model)) {
  int enc_input_size = embd_inp.size();
  llama_token * enc_input_buf = embd_inp.data();

  if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

@@ -565,9 +555,8 @@ int main(int argc, char ** argv) {
  embd.resize(max_embd_size);

  console::set_display(console::error);
- printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+ LOG_WRN("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
  console::set_display(console::reset);
- fflush(stdout);
  }

  if (ga_n == 1) {
@@ -575,33 +564,35 @@ int main(int argc, char ** argv) {
  // if we run out of context:
  // - take the n_keep first tokens from the original prompt (via n_past)
  // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
- if (params.n_predict == -2) {
- LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
- break;
- }

- const int n_left = n_past - params.n_keep;
- const int n_discard = n_left/2;
+ if (n_past + (int) embd.size() >= n_ctx) {
+ if (!params.ctx_shift){
+ LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__);
+ break;
+ } else {
+ if (params.n_predict == -2) {
+ LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
+ break;
+ }

- LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
- n_past, n_left, n_ctx, params.n_keep, n_discard);
+ const int n_left = n_past - params.n_keep;
+ const int n_discard = n_left/2;

- llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
- llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+ LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
+ n_past, n_left, n_ctx, params.n_keep, n_discard);

- n_past -= n_discard;
+ llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard);
+ llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);

- if (ctx_guidance) {
- n_past_guidance -= n_discard;
- }
+ n_past -= n_discard;

- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+ LOG_DBG("after swap: n_past = %d\n", n_past);

- LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str());

- LOG("clear session path\n");
- path_session.clear();
+ LOG_DBG("clear session path\n");
+ path_session.clear();
+ }
  }
  } else {
  // context extension via Self-Extend
@@ -610,10 +601,10 @@ int main(int argc, char ** argv) {
  const int bd = (ga_w/ga_n)*(ga_n - 1);
  const int dd = (ga_w/ga_n) - ib*bd - ga_w;

- LOG("\n");
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
- LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
- LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
+ LOG_DBG("\n");
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd);
+ LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
+ LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);

  llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd);
  llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n);
@@ -623,7 +614,7 @@ int main(int argc, char ** argv) {

  ga_i += ga_w/ga_n;

- LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
+ LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i);
  }
  }

@@ -649,65 +640,25 @@ int main(int argc, char ** argv) {
  }
  }

- // evaluate tokens in batches
- // embd is typically prepared beforehand to fit within a batch, but not always
- if (ctx_guidance) {
- int input_size = 0;
- llama_token * input_buf = NULL;
-
- if (n_past_guidance < (int) guidance_inp.size()) {
- // Guidance context should have the same data with these modifications:
- //
- // * Replace the initial prompt
- // * Shift everything by guidance_offset
- embd_guidance = guidance_inp;
- if (embd.begin() + original_prompt_len < embd.end()) {
- embd_guidance.insert(
- embd_guidance.end(),
- embd.begin() + original_prompt_len,
- embd.end()
- );
- }
-
- input_buf = embd_guidance.data();
- input_size = embd_guidance.size();
-
- LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
- } else {
- input_buf = embd.data();
- input_size = embd.size();
- }
-
- for (int i = 0; i < input_size; i += params.n_batch) {
- int n_eval = std::min(input_size - i, params.n_batch);
- if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
- return 1;
- }
-
- n_past_guidance += n_eval;
- }
- }
-
  for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
  int n_eval = (int) embd.size() - i;
  if (n_eval > params.n_batch) {
  n_eval = params.n_batch;
  }

- LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
+ LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

  if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return 1;
  }

  n_past += n_eval;

- LOG("n_past = %d\n", n_past);
+ LOG_DBG("n_past = %d\n", n_past);
  // Display total tokens alongside total time
  if (params.n_print > 0 && n_past % params.n_print == 0) {
- LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+ LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
  }
  }

@@ -718,7 +669,6 @@ int main(int argc, char ** argv) {
  }

  embd.clear();
- embd_guidance.clear();

  if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
  // optionally save the session on first sample (for faster prompt loading next time)
@@ -726,14 +676,14 @@ int main(int argc, char ** argv) {
  need_to_save_session = false;
  llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());

- LOG("saved session to %s\n", path_session.c_str());
+ LOG_DBG("saved session to %s\n", path_session.c_str());
  }

- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+ const llama_token id = gpt_sampler_sample(smpl, ctx, -1);

- llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
+ gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);

- LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
+ // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

  embd.push_back(id);

@@ -743,16 +693,16 @@ int main(int argc, char ** argv) {
  // decrement remaining sampling budget
  --n_remain;

- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
  // some user input remains from prompt or interaction, forward it to processing
- LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
+ LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
  while ((int) embd_inp.size() > n_consumed) {
  embd.push_back(embd_inp[n_consumed]);

  // push the prompt in the sampling context in order to apply repetition penalties later
  // for the prompt, we don't apply grammar rules
- llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
+ gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);

  ++n_consumed;
  if ((int) embd.size() >= params.n_batch) {
@@ -767,7 +717,7 @@ int main(int argc, char ** argv) {
  const std::string token_str = llama_token_to_piece(ctx, id, params.special);

  // Console/Stream Output
- fprintf(stdout, "%s", token_str.c_str());
+ LOG("%s", token_str.c_str());

  // Record Displayed Tokens To Log
  // Note: Generated tokens are created one by one hence this check
@@ -779,8 +729,6 @@ int main(int argc, char ** argv) {
  output_tokens.push_back(id);
  output_ss << token_str;
  }
-
- fflush(stdout);
  }
  }

@@ -795,7 +743,7 @@ int main(int argc, char ** argv) {
  // check for reverse prompt in the last n_prev tokens
  if (!params.antiprompt.empty()) {
  const int n_prev = 32;
- const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
+ const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);

  is_antiprompt = false;
  // Check if each of the reverse prompts appears at the end of the output.
@@ -817,7 +765,7 @@ int main(int argc, char ** argv) {
  }
  }

  // check for reverse prompt using special tokens
- llama_token last_token = llama_sampling_last(ctx_sampling);
+ llama_token last_token = gpt_sampler_last(smpl);
  for (std::vector<llama_token> ids : antiprompt_ids) {
  if (ids.size() == 1 && last_token == ids[0]) {
  if (params.interactive) {
@@ -829,13 +777,13 @@ int main(int argc, char ** argv) {
  }

  if (is_antiprompt) {
- LOG("found antiprompt: %s\n", last_output.c_str());
+ LOG_DBG("found antiprompt: %s\n", last_output.c_str());
  }
  }

  // deal with end of generation tokens in interactive mode
- if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
- LOG("found an EOG token\n");
+ if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+ LOG_DBG("found an EOG token\n");

  if (params.interactive) {
  if (!params.antiprompt.empty()) {
@@ -849,32 +797,32 @@ int main(int argc, char ** argv) {
  chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
  }
  is_interacting = true;
- printf("\n");
+ LOG("\n");
  }
  }

  // if current token is not EOG, we add it to current assistant message
  if (params.conversation) {
- auto id = llama_sampling_last(ctx_sampling);
+ const auto id = gpt_sampler_last(smpl);
  assistant_ss << llama_token_to_piece(ctx, id, false);
  }

  if (n_past > 0 && is_interacting) {
- LOG("waiting for user input\n");
+ LOG_DBG("waiting for user input\n");

  if (params.conversation) {
- printf("\n> ");
+ LOG("\n> ");
  }

  if (params.input_prefix_bos) {
- LOG("adding input prefix BOS token\n");
+ LOG_DBG("adding input prefix BOS token\n");
  embd_inp.push_back(llama_token_bos(model));
  }

  std::string buffer;
  if (!params.input_prefix.empty() && !params.conversation) {
- LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
- printf("%s", params.input_prefix.c_str());
+ LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
+ LOG("%s", params.input_prefix.c_str());
  }

  // color user input only
@@ -897,11 +845,11 @@ int main(int argc, char ** argv) {
  if (buffer.length() > 1) {
  // append input suffix if any
  if (!params.input_suffix.empty() && !params.conversation) {
- LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
- printf("%s", params.input_suffix.c_str());
+ LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
+ LOG("%s", params.input_suffix.c_str());
  }

- LOG("buffer: '%s'\n", buffer.c_str());
+ LOG_DBG("buffer: '%s'\n", buffer.c_str());

  const size_t original_size = embd_inp.size();

@@ -918,7 +866,7 @@ int main(int argc, char ** argv) {
  const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
  const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

- LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
+ LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());

  // if user stop generation mid-way, we must add EOT to finish model's last response
  if (need_insert_eot && format_chat) {
@@ -941,9 +889,9 @@ int main(int argc, char ** argv) {
  assistant_ss.str("");

  n_remain -= line_inp.size();
- LOG("n_remain: %d\n", n_remain);
+ LOG_DBG("n_remain: %d\n", n_remain);
  } else {
- LOG("empty line, passing control back\n");
+ LOG_DBG("empty line, passing control back\n");
  }

  input_echo = false; // do not echo this again
@@ -951,7 +899,7 @@

  if (n_past > 0) {
  if (is_interacting) {
- llama_sampling_reset(ctx_sampling);
+ gpt_sampler_reset(smpl);
  }
  is_interacting = false;
  }
@@ -959,7 +907,7 @@

  // end of generation
  if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
- LOG_TEE(" [end of text]\n");
+ LOG(" [end of text]\n");
  break;
  }

@@ -972,23 +920,23 @@ int main(int argc, char ** argv) {
  }

  if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
- LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+ LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
  llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
  }

- llama_print_timings(ctx);
+ LOG("\n\n");
+ gpt_perf_print(ctx, smpl);
  write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

- if (ctx_guidance) { llama_free(ctx_guidance); }
+ gpt_sampler_free(smpl);
+
  llama_free(ctx);
  llama_free_model(model);

- llama_sampling_free(ctx_sampling);
  llama_backend_free();

- #ifndef LOG_DISABLE_LOGS
- LOG_TEE("Log end\n");
- #endif // LOG_DISABLE_LOGS
+ ggml_threadpool_free(threadpool);
+ ggml_threadpool_free(threadpool_batch);

  return 0;
  }
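The other notable addition in the hunks above is the explicit ggml threadpool setup (`ggml_threadpool_params_from_cpu_params`, `ggml_threadpool_new`, `llama_attach_threadpool`, `ggml_threadpool_free`). A condensed sketch of that wiring follows, refactored into a standalone helper purely for illustration (the diff does this inline in main()); the helper name and its out-parameters are this note's own, everything else follows the hunk above.

    // Illustration only: the threadpool wiring from the hunk above, pulled into a helper.
    static bool attach_threadpools(llama_context * ctx, gpt_params & params,
                                   struct ggml_threadpool *& threadpool,
                                   struct ggml_threadpool *& threadpool_batch) {
        struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
        struct ggml_threadpool_params tpp       = ggml_threadpool_params_from_cpu_params(params.cpuparams);

        set_process_priority(params.cpuparams.priority);

        threadpool_batch = NULL;
        if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
            // separate pool for batch (prompt) processing; the generation pool starts paused
            threadpool_batch = ggml_threadpool_new(&tpp_batch);
            if (!threadpool_batch) {
                LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
                return false;
            }
            tpp.paused = true;
        }

        threadpool = ggml_threadpool_new(&tpp);
        if (!threadpool) {
            LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
            return false;
        }

        llama_attach_threadpool(ctx, threadpool, threadpool_batch);
        return true;
    }
    // Both pools are released with ggml_threadpool_free() after llama_free(ctx), as in the final hunk.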