@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/imatrix/imatrix.cpp
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -17,15 +19,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n    %s \\\n"
-            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
             "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-    LOG_TEE("\n");
+    LOG("\n");
 }
 
 struct Stats {
@@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             e.counts.resize(src1->ne[0]*n_as, 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
             exit(1); //GGML_ABORT("fatal error");
         }
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
-        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
         // loop over all possible experts, regardless if they are used or not in the batch
         for (int ex = 0; ex < n_as; ++ex) {
             size_t e_start = ex*src1->ne[0];
@@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                     e.values[e_start + j] += x[j]*x[j];
                     e.counts[e_start + j]++;
                     if (!std::isfinite(e.values[e_start + j])) {
-                        fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
+                        LOG("\n");
+                        LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
                         exit(1);
                     }
                 }
@@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         e.counts.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ABORT("fatal error");
         }
         ++e.ncall;
-        if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-        }
+        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
             for (int j = 0; j < (int)src1->ne[0]; ++j) {
                 e.values[j] += x[j]*x[j];
                 e.counts[j]++;
                 if (!std::isfinite(e.values[j])) {
-                    fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
+                    LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
                     exit(1);
                 }
             }
@@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         }
 
         if (n_zeros != 0 && is_first) {
-            fprintf(stderr, "\n");
+            LOG_INF("\n");
             is_first = false;
         }
 
         if (n_zeros == n_all) {
-            fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
+            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
             continue;
         }
 
         if (n_zeros > 0) {
-            fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
+            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
             continue;
         }
 
@@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
     }
 
     if (to_store.size() < m_stats.size()) {
-        fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
+        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
     }
 
     std::ofstream out(fname, std::ios::binary);
@@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
         out.write(m_params.prompt_file.c_str(), len);
     }
 
-    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
-    }
+    LOGV(1, "\n");
+    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
 }
 
 bool IMatrixCollector::load_imatrix(const char * fname) {
     std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__, fname);
+        LOG_ERR("%s: failed to open %s\n",__func__, fname);
         return false;
     }
     int n_entries;
     in.read((char*)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, fname);
+        LOG_ERR("%s: no data in file %s\n", __func__, fname);
         return false;
     }
     for (int i = 0; i < n_entries; ++i) {
@@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
+            LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
             return false;
         }
         name_as_vec[len] = 0;
@@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
             m_stats = {};
             return false;
         }
@@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
         std::vector<float> tmp(nval);
         in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n",__func__,i);
+            LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
             m_stats = {};
             return false;
         }
@@ -433,31 +429,30 @@ static void process_logits(
 }
 
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
     if (params.i_chunk > 0) {
         if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
+            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
         tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }
 
     if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
         return false;
     }
 
@@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     double nll = 0.0;
     double nll2 = 0.0;
 
-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
+    LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
 
     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -515,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
             // TODO: use batch.logits to save computations instead of relying on logits_all == true
             if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_ERR("%s : failed to eval\n", __func__);
                 return false;
             }
 
@@ -532,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
         if (i == 0) {
             const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
             int total_seconds = (int)(t_total * n_chunk);
             if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                LOG("%d hours ", total_seconds / (60*60));
                 total_seconds = total_seconds % (60*60);
             }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+            LOG("%.2f minutes\n", total_seconds / 60.0);
         }
 
         if (params.compute_ppl) {
             const int first = n_ctx/2;
-            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+            const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
             process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                            workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
             count += n_ctx - first - 1;
 
-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
             fflush(stdout);
 
             logits.clear();
         }
     }
-    printf("\n");
+    LOG("\n");
 
     if (params.compute_ppl) {
         nll2 /= count;
@@ -563,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
         nll2 -= nll * nll;
         if (nll2 > 0) {
             nll2 = sqrt(nll2/(count-1));
-            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
         } else {
-            printf("Unexpected negative standard deviation of log(prob)\n");
+            LOG("Unexpected negative standard deviation of log(prob)\n");
         }
     }
 
@@ -577,27 +572,28 @@ int main(int argc, char ** argv) {
 
     params.n_ctx = 512;
     params.logits_all = true;
-    params.verbosity = 1;
+    params.escape = false;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }
 
+    gpt_init();
+
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
     g_collector.set_params(params);
 
     for (const auto & in_file : params.in_files) {
-        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
         if (!g_collector.load_imatrix(in_file.c_str())) {
-            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
+            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
             return 1;
         }
     }
 
     if (params.in_files.size() > 1) {
-        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
         g_collector.save_imatrix();
     }
 
@@ -611,25 +607,25 @@ int main(int argc, char ** argv) {
     params.warmup = false;
 
     // init
-    llama_model * model;
-    llama_context * ctx;
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
 
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
     if (model == nullptr || ctx == nullptr) {
-        fprintf(stderr, "%s : failed to init\n", __func__);
+        LOG_ERR("%s : failed to init\n", __func__);
         return 1;
     }
 
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
 
     if (!compute_imatrix(ctx, params)) {
@@ -638,7 +634,8 @@ int main(int argc, char ** argv) {
 
     g_collector.save_imatrix();
 
-    llama_print_timings(ctx);
+    LOG("\n");
+    llama_perf_context_print(ctx);
 
     llama_free(ctx);
     llama_free_model(model);
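
Note: the imatrix.cpp diff above reflects the upstream llama.cpp API migration bundled in 0.3.2: argument parsing moved to common/arg.h (gpt_params_parse now takes an example id and a usage callback), fprintf/printf diagnostics moved to the LOG/LOG_INF/LOG_WRN/LOG_ERR macros from common/log.h, llama_init_from_gpt_params now returns a llama_init_result struct instead of a std::tie-able pair, and llama_print_timings became llama_perf_context_print. A minimal sketch of the new entry-point pattern, assembled only from the calls visible in this diff (exact signatures live in the bundled headers; treat this as an illustration, not a drop-in program):

    #include "arg.h"
    #include "common.h"
    #include "log.h"
    #include "llama.h"

    // illustrative usage callback; the body here is hypothetical
    static void print_usage(int, char ** argv) {
        LOG("\nusage: %s -m model.gguf ...\n", argv[0]);
    }

    int main(int argc, char ** argv) {
        gpt_params params;

        // parsing now takes an example id plus an optional usage callback
        if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
            return 1;
        }

        gpt_init();

        // init returns a result struct instead of a (model, context) pair
        llama_init_result llama_init = llama_init_from_gpt_params(params);

        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;
        if (model == nullptr || ctx == nullptr) {
            LOG_ERR("%s : failed to init\n", __func__);
            return 1;
        }

        // ... application work; stderr diagnostics become LOG_INF/LOG_WRN/LOG_ERR ...

        LOG("\n");
        llama_perf_context_print(ctx); // replaces llama_print_timings

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }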