@fugood/llama.node 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/CMakeLists.txt +1 -10
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +6 -4
  17. package/src/LlamaCompletionWorker.cpp +6 -6
  18. package/src/LlamaContext.cpp +7 -9
  19. package/src/common.hpp +2 -1
  20. package/src/llama.cpp/.github/workflows/build.yml +98 -24
  21. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  22. package/src/llama.cpp/.github/workflows/docker.yml +43 -34
  23. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  24. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  25. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  26. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  27. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  28. package/src/llama.cpp/CMakeLists.txt +20 -8
  29. package/src/llama.cpp/common/CMakeLists.txt +12 -10
  30. package/src/llama.cpp/common/arg.cpp +2006 -0
  31. package/src/llama.cpp/common/arg.h +77 -0
  32. package/src/llama.cpp/common/common.cpp +496 -1632
  33. package/src/llama.cpp/common/common.h +161 -63
  34. package/src/llama.cpp/common/console.cpp +3 -0
  35. package/src/llama.cpp/common/log.cpp +401 -0
  36. package/src/llama.cpp/common/log.h +66 -698
  37. package/src/llama.cpp/common/ngram-cache.cpp +3 -0
  38. package/src/llama.cpp/common/sampling.cpp +348 -350
  39. package/src/llama.cpp/common/sampling.h +62 -139
  40. package/src/llama.cpp/common/stb_image.h +5990 -6398
  41. package/src/llama.cpp/common/train.cpp +2 -0
  42. package/src/llama.cpp/docs/build.md +36 -1
  43. package/src/llama.cpp/examples/CMakeLists.txt +0 -1
  44. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
  45. package/src/llama.cpp/examples/batched/batched.cpp +39 -55
  46. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
  47. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  48. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
  49. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  50. package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
  51. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
  52. package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
  53. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  54. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
  55. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  57. package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
  58. package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
  59. package/src/llama.cpp/examples/infill/infill.cpp +117 -132
  60. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
  61. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
  62. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  63. package/src/llama.cpp/examples/llava/clip.cpp +685 -150
  64. package/src/llama.cpp/examples/llava/clip.h +11 -2
  65. package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
  66. package/src/llama.cpp/examples/llava/llava.cpp +110 -24
  67. package/src/llama.cpp/examples/llava/llava.h +2 -3
  68. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  69. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  70. package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
  71. package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
  72. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
  73. package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
  74. package/src/llama.cpp/examples/main/main.cpp +210 -262
  75. package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
  76. package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
  77. package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
  78. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  80. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
  81. package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
  82. package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
  83. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
  84. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
  85. package/src/llama.cpp/examples/server/server.cpp +1027 -1073
  86. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  87. package/src/llama.cpp/examples/server/utils.hpp +107 -105
  88. package/src/llama.cpp/examples/simple/simple.cpp +35 -41
  89. package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
  90. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  91. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  92. package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
  93. package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
  94. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  95. package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
  96. package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
  97. package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
  98. package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
  99. package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
  100. package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
  101. package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
  102. package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
  103. package/src/llama.cpp/ggml/include/ggml.h +293 -186
  104. package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
  105. package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
  106. package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
  107. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
  108. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
  109. package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
  110. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  111. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  112. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  113. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  114. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  115. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  116. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  117. package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
  118. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
  120. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  121. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  122. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  123. package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
  124. package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
  125. package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
  126. package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
  127. package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
  128. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  129. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
  130. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
  131. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  132. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  133. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  134. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  135. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  136. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  137. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
  138. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  141. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
  142. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
  143. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
  144. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  145. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  146. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
  148. package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
  149. package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
  150. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
  151. package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
  152. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
  153. package/src/llama.cpp/include/llama.h +241 -264
  154. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  155. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  156. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  157. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  158. package/src/llama.cpp/src/llama-grammar.h +120 -15
  159. package/src/llama.cpp/src/llama-impl.h +156 -1
  160. package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
  161. package/src/llama.cpp/src/llama-sampling.h +20 -47
  162. package/src/llama.cpp/src/llama-vocab.cpp +343 -120
  163. package/src/llama.cpp/src/llama-vocab.h +33 -17
  164. package/src/llama.cpp/src/llama.cpp +4247 -1525
  165. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  166. package/src/llama.cpp/src/unicode-data.h +4 -4
  167. package/src/llama.cpp/src/unicode.cpp +15 -7
  168. package/src/llama.cpp/tests/CMakeLists.txt +3 -0
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
  171. package/src/llama.cpp/tests/test-barrier.cpp +93 -0
  172. package/src/llama.cpp/tests/test-grad0.cpp +187 -70
  173. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  174. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  175. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
  176. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  177. package/src/llama.cpp/tests/test-log.cpp +39 -0
  178. package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
  179. package/src/llama.cpp/tests/test-rope.cpp +1 -1
  180. package/src/llama.cpp/tests/test-sampling.cpp +157 -98
  181. package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
  182. package/patches/llama.patch +0 -22
  183. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  184. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  185. package/src/llama.cpp/common/grammar-parser.h +0 -29
  186. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  187. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
@@ -1,18 +1,21 @@
+ #include "arg.h"
  #include "common.h"
+ #include "log.h"
  #include "llama.h"
 
+ #include <algorithm>
+ #include <array>
+ #include <atomic>
  #include <cmath>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
+ #include <fstream>
+ #include <mutex>
+ #include <random>
  #include <sstream>
  #include <thread>
- #include <mutex>
- #include <atomic>
  #include <vector>
- #include <array>
- #include <fstream>
- #include <sstream>
 
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -40,7 +43,7 @@ static void write_logfile(
  }
 
  if (params.hellaswag) {
- fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
+ LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
  return;
  }
 
@@ -48,7 +51,7 @@ static void write_logfile(
 
  const bool success = fs_create_directory_with_parents(params.logdir);
  if (!success) {
- fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
+ LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
  __func__, params.logdir.c_str());
  return;
  }
@@ -57,7 +60,7 @@ static void write_logfile(
  FILE * logfile = fopen(logfile_path.c_str(), "w");
 
  if (logfile == NULL) {
- fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
+ LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
  return;
  }
 
@@ -76,7 +79,7 @@ static void write_logfile(
  fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
  yaml_dump_vector_float(logfile, "probs", results.probs);
 
- llama_dump_timing_info_yaml(logfile, ctx);
+ llama_perf_dump_yaml(logfile, ctx);
  fclose(logfile);
  }
 
@@ -340,19 +343,19 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  // Output: `perplexity: 13.5106 [114/114]`
  // BOS tokens will be added for each chunk before eval
 
- const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+ const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+ GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
 
  std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
  const int n_ctx = llama_n_ctx(ctx);
 
  if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
  n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
  return {std::move(tokens), 0., {}, {}};
  }
 
@@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  prob_history.resize(tokens.size());
 
  if (params.ppl_stride <= 0) {
- fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+ LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
  return {tokens, -1, logit_history, prob_history};
  }
 
  const int calc_chunk = n_ctx;
 
- fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+ LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
 
  if (int(tokens.size()) <= calc_chunk) {
- fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+ LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
  tokens.size(), n_ctx, params.ppl_stride);
  return {tokens, -1, logit_history, prob_history};
  }
@@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  int count = 0;
  double nll = 0.0;
 
- fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+ LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
  for (int i = 0; i < n_chunk; ++i) {
  const int start = i * params.ppl_stride;
  const int end = start + calc_chunk;
 
  const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
- //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+ //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
 
  std::vector<float> logits;
 
@@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  const int batch_start = start + j * n_batch;
  const int batch_size = std::min(end - batch_start, n_batch);
 
- //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+ //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
  // TODO: use llama_batch.logits instead of relying on logits_all == true
  if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- //fprintf(stderr, "%s : failed to eval\n", __func__);
+ //LOG_ERR("%s : failed to eval\n", __func__);
  return {tokens, -1, logit_history, prob_history};
  }
 
@@ -433,16 +436,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
  if (i == 0) {
  const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
  int total_seconds = (int)(t_total * n_chunk);
  if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
  total_seconds = total_seconds % (60*60);
  }
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
  }
 
- //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+ //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
  for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
 
  // Calculate probability of next token, given the previous ones.
@@ -459,13 +462,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
  }
  // perplexity is e^(average negative log-likelihood)
  if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
  } else {
- printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+ LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
  }
- fflush(stdout);
  }
- printf("\n");
+ LOG("\n");
 
  return {tokens, std::exp(nll / count), logit_history, prob_history};
  }
@@ -480,33 +482,33 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  // Output: `perplexity: 13.5106 [114/114]`
  // BOS tokens will be added for each chunk before eval
 
- const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+ const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+ GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
  std::ofstream logits_stream;
  if (!params.logits_file.empty()) {
  logits_stream.open(params.logits_file.c_str(), std::ios::binary);
  if (!logits_stream.is_open()) {
- fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
  return {};
  }
- fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
+ LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
  logits_stream.write("_logits_", 8);
  logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
  }
 
  auto tim1 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+ LOG_INF("%s: tokenizing the input ..\n", __func__);
 
  std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
  auto tim2 = std::chrono::high_resolution_clock::now();
- fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
+ LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
  if (int(tokens.size()) < 2*n_ctx) {
- fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
+ LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
  n_ctx);
- fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+ LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
  return {std::move(tokens), 0., {}, {}};
  }
 
@@ -539,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  logits.reserve((size_t)n_ctx * n_vocab);
  }
 
- fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
+ LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 
  std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -612,7 +614,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  }
 
  if (llama_decode(ctx, batch)) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_INF("%s : failed to eval\n", __func__);
  return {tokens, -1, logit_history, prob_history};
  }
 
@@ -627,13 +629,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  llama_synchronize(ctx);
  const auto t_end = std::chrono::high_resolution_clock::now();
  const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
  int total_seconds = (int)(t_total*n_chunk/n_seq);
  if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
  total_seconds = total_seconds % (60*60);
  }
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+ LOG("%.2f minutes\n", total_seconds / 60.0);
  }
 
  for (int seq = 0; seq < n_seq_batch; seq++) {
@@ -655,19 +657,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
  // perplexity is e^(average negative log-likelihood)
  if (params.ppl_output_type == 0) {
- printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
+ LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
  } else {
  double av = nll/count;
  double av2 = nll2/count - av*av;
  if (av2 > 0) av2 = sqrt(av2/(count-1));
- printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
+ LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
  }
  }
- fflush(stdout);
 
  logits.clear();
  }
- printf("\n");
+ LOG("\n");
 
  nll2 /= count;
  nll /= count;
@@ -675,9 +676,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  nll2 -= nll * nll;
  if (nll2 > 0) {
  nll2 = sqrt(nll2/(count-1));
- printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+ LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
  } else {
- printf("Unexpected negative standard deviation of log(prob)\n");
+ LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
  }
 
  llama_batch_free(batch);
@@ -703,7 +704,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
 
  const int ret = llama_decode(ctx, batch_view);
  if (ret != 0) {
- LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+ LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
  return false;
  }
 
@@ -789,15 +790,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }
 
  if (prompt_lines.size() % 6 != 0) {
- fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+ LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
  return;
  }
 
  size_t hs_task_count = prompt_lines.size()/6;
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
  const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
- fprintf(stderr, "================================= is_spm = %d\n", is_spm);
+ LOG_INF("================================= is_spm = %d\n", is_spm);
 
  // The tasks should be randomized so the score stabilizes quickly.
  bool randomize_tasks = true;
@@ -824,7 +825,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  std::vector<llama_token> seq_tokens[4];
  };
 
- fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
+ LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
 
  // Select and read data from prompt lines
  std::vector<hs_data_t> hs_data(hs_task_count);
@@ -870,9 +871,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }
  }
 
- fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
 
  double acc = 0.0f;
 
@@ -940,7 +941,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }
 
  if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
  return;
  }
 
@@ -948,7 +949,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return;
  }
 
@@ -998,7 +999,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }
  }
 
- //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
+ //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
 
  // If the gold ending got the maximum logprobe add one accuracy point
  if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1006,8 +1007,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }
 
  // Print the accumulated accuracy mean x 100
- printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
- fflush(stdout);
+ LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
  }
 
  i0 = i1 - 1;
@@ -1015,7 +1015,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
  llama_batch_free(batch);
 
- printf("\n");
+ LOG("\n");
  }
 
  struct winogrande_entry {
@@ -1059,7 +1059,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  }
  }
  if (ipos != 4) {
- printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
+ LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
  continue;
  }
  auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1073,13 +1073,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  if (sentence[where] == '_') break;
  }
  if (where == int(sentence.size())) {
- printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
+ LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
  continue;
  }
  std::istringstream stream(answer.c_str());
  int i_answer; stream >> i_answer;
  if (stream.fail() || i_answer < 1 || i_answer > 2) {
- printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
+ LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
  continue;
  }
  result.emplace_back();
@@ -1108,14 +1108,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
  auto data = load_winogrande_from_csv(params.prompt);
  if (data.empty()) {
- fprintf(stderr, "%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
  return;
  }
 
- fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size());
+ LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
 
  if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
- fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
+ LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
  std::mt19937 rng(1);
  std::vector<int> aux(data.size());
  for (int i = 0; i < int(data.size()); ++i) {
@@ -1133,7 +1133,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  data = std::move(selected);
  }
 
- fprintf(stderr, "%s : tokenizing selected tasks\n", __func__);
+ LOG_INF("%s : tokenizing selected tasks\n", __func__);
 
  for (auto & task : data) {
  task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1156,7 +1156,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
  }
 
- fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
+ LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
@@ -1217,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  }
 
  if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
  return;
  }
 
@@ -1225,7 +1225,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return;
  }
 
@@ -1285,20 +1285,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  ++n_done;
 
  // print the accumulated accuracy mean x 100
- printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
- fflush(stdout);
+ LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
  }
 
  i0 = i1 - 1;
  }
 
- printf("\n");
+ LOG("\n");
 
  if (n_done < 100) return;
 
  const float p = 1.f*n_correct/n_done;
  const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
- printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
+
+ LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
  }
 
  static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1347,7 +1347,7 @@ struct multiple_choice_task {
  static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
  if (task.question.empty() || task.mc1.answers.empty()) {
  if (log_error) {
- printf("%s: found bad task with empty question and/or answers\n", __func__);
+ LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
  }
  return false;
  }
@@ -1355,7 +1355,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  for (auto& answer : task.mc1.answers) {
  if (answer.empty()) {
  if (log_error) {
- printf("%s: found empty answer\n", __func__);
+ LOG_ERR("%s: found empty answer\n", __func__);
  }
  return false;
  }
@@ -1409,14 +1409,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  uint32_t n_task;
  strstream.read((char *)&n_task, sizeof(n_task));
  if (strstream.fail() || n_task == 0) {
- printf("%s: no tasks\n", __func__);
+ LOG_ERR("%s: no tasks\n", __func__);
  return;
  }
- printf("%s: there are %u tasks in prompt\n", __func__, n_task);
+ LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
  std::vector<uint32_t> task_pos(n_task);
  strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
  if (strstream.fail()) {
- printf("%s: failed to read task positions from prompt\n", __func__);
+ LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
  return;
  }
 
@@ -1424,21 +1424,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
  // Use all tasks
  tasks.resize(n_task);
- printf("%s: reading tasks", __func__);
+ LOG_INF("%s: reading tasks", __func__);
  int n_dot = std::max((int) n_task/100, 1);
  int i = 0;
  for (auto& task : tasks) {
  ++i;
  if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
+ LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
  return;
  }
- if (i%n_dot == 0) printf(".");
+ if (i%n_dot == 0) LOG(".");
  }
- printf("done\n");
+ LOG("done\n");
  }
  else {
- printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
+ LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
  std::mt19937 rng(1);
  std::vector<int> aux(n_task);
  for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1451,18 +1451,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  aux.pop_back();
  strstream.seekg(task_pos[idx], std::ios::beg);
  if (!task.deserialize(strstream)) {
- printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
+ LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
  return;
  }
  }
  n_task = params.multiple_choice_tasks;
  }
 
- printf("%s: preparing task data", __func__);
- fflush(stdout);
+ LOG_INF("%s: preparing task data", __func__);
  if (n_task > 500) {
- printf("...");
- fflush(stdout);
+ LOG("...");
  std::atomic<int> counter(0);
  std::atomic<int> n_bad(0);
  auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1486,11 +1484,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  for (auto& w : workers) w = std::thread(prepare);
  prepare();
  for (auto& w : workers) w.join();
- printf("done\n");
- fflush(stdout);
+ LOG("done\n");
  int nbad = n_bad;
  if (nbad > 0) {
- printf("%s: found %d malformed tasks\n", __func__, nbad);
+ LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
  return;
  }
  } else {
@@ -1502,16 +1499,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  return;
  }
  if (i_task%n_dot == 0) {
- printf(".");
- fflush(stdout);
+ LOG(".");
  }
  }
- printf("done\n");
+ LOG("done\n");
  }
 
- printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
+ LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
 
- printf("\ntask\tacc_norm\n");
+ LOG("\ntask\tacc_norm\n");
 
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
@@ -1590,7 +1586,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  }
 
  if (i0 == i1) {
- fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
+ LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
  return;
  }
 
@@ -1598,7 +1594,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
  // decode all tasks [i0, i1)
  if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
- fprintf(stderr, "%s: llama_decode() failed\n", __func__);
+ LOG_ERR("%s: llama_decode() failed\n", __func__);
  return;
  }
 
@@ -1622,13 +1618,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  // compute the logprobs for each ending of the decoded tasks
  for (size_t i = i0; i < i1; ++i) {
  auto & cur_task = tasks[i];
- //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
+ //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
  //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
  // if (cur_task.mc1.labels[j] == 1) {
- // printf("%d", j+1);
+ // LOG("%d", j+1);
  // }
  //}
- //printf("\n common_prefix: %zu\n", cur_task.common_prefix);
+ //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
 
  // get the logits of the last token of the common prefix
  std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1640,13 +1636,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  size_t count = 1;
  float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
  for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
- //printf(" %zu %g\n", ir, eval_results[ir]);
+ //LOG(" %zu %g\n", ir, eval_results[ir]);
  ++count;
  log_prob += eval_results[ir++];
  }
  cur_task.log_probs[s] = log_prob / count;
- //printf(" Final: %g\n", log_prob / count);
- //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
+ //LOG(" Final: %g\n", log_prob / count);
+ //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
  }
 
  // Find the ending with maximum logprob
@@ -1666,8 +1662,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  ++n_done;
 
  // Print the accumulated accuracy mean x 100
- printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
- fflush(stdout);
+ LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
  }
 
  i0 = i1 - 1;
@@ -1679,29 +1674,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
  float p = 1.f*n_correct/n_done;
  float sigma = sqrt(p*(1-p)/(n_done-1));
- printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG("\n");
+ LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
  p = 1.f*n_done/n_tot_answers;
  sigma = sqrt(p*(1-p)/(n_done-1));
- printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
+ LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
 
- printf("\n");
+ LOG_INF("\n");
  }
 
  static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  if (params.logits_file.empty()) {
- fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
+ LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
  return;
  }
  std::ifstream in(params.logits_file.c_str(), std::ios::binary);
  if (!in) {
- fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
  return;
  }
  {
  char check[9]; check[8] = 0;
  in.read(check, 8);
  if (in.fail() || strncmp("_logits_", check, 8) != 0) {
- fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
  return;
  }
  }
@@ -1709,7 +1705,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  uint32_t n_ctx;
  in.read((char *)&n_ctx, sizeof(n_ctx));
  if (n_ctx > llama_n_ctx(ctx)) {
- fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
+ LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
  __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
  }
 
@@ -1717,24 +1713,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  in.read((char *)&n_vocab, sizeof(n_vocab));
  in.read((char *)&n_chunk, sizeof(n_chunk));
  if (in.fail()) {
- fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
  return;
  }
  if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
- fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+ LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
  }
 
  std::vector<llama_token> tokens(n_ctx * n_chunk);
  if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
- fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
+ LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
  return;
  }
 
  const int n_batch = params.n_batch;
  const int num_batches = (n_ctx + n_batch - 1)/n_batch;
  const int nv = 2*((n_vocab + 1)/2) + 4;
- const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
- GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
+ const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+ GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
  std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
  std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1775,7 +1771,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  const auto t_start = std::chrono::high_resolution_clock::now();
 
  if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
- fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
+ LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
  return;
  }
 
@@ -1796,7 +1792,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
  // TODO: use llama_batch.logits instead of relying on logits_all == true
  if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
- fprintf(stderr, "%s : failed to eval\n", __func__);
+ LOG_ERR("%s : failed to eval\n", __func__);
  return;
  }
 
@@ -1813,16 +1809,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
  if (i == 0) {
  const float t_total = std::chrono::duration<float>(t_end - t_start).count();
- fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+ LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
  int total_seconds = (int)(t_total * n_chunk);
  if (total_seconds >= 60*60) {
- fprintf(stderr, "%d hours ", total_seconds / (60*60));
+ LOG("%d hours ", total_seconds / (60*60));
  total_seconds = total_seconds % (60*60);
  }
- fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
- printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
+ LOG("%.2f minutes\n", total_seconds / 60.0);
  }
+ LOG("\n");
+ LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
 
  const int first = n_ctx/2;
  const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@@ -1831,79 +1827,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  p_diff_ptr += n_ctx - 1 - first;
  kld_ptr += n_ctx - 1 - first;
 
- printf("%4d", i+1);
+ LOG("%4d", i+1);
 
  auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
  const double ppl_val = exp(log_ppl.first);
  const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
+ LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
 
  auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
  const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
  const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
  const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
 
  auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
+ LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
 
  auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
  const double p_diff_rms_val = sqrt(p_diff_mse.first);
  const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
  double p_top_val = 1.*kld.n_same_top/kld.count;
  double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
- printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
+ LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
 
- printf("\n");
-
- fflush(stdout);
+ LOG("\n");
 
  logits.clear();
  }
- printf("\n");
+ LOG("\n");
 
  if (kld.count < 100) return; // we do not wish to do statistics on so few values
 
  std::sort(kld_values.begin(), kld_values.end());
  std::sort(p_diff_values.begin(), p_diff_values.end());
 
- printf("====== Perplexity statistics ======\n");
+ LOG("====== Perplexity statistics ======\n");
 
  auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
  const double ppl_val = exp(log_ppl.first);
  const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
- printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
+ LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
 
  auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
  const double ppl_base_val = exp(log_ppl_base.first);
  const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
- printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
+ LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
 
  const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
- // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
+ // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
  const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
- printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
+ LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
 
  const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
  const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
- printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
+ LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
 
  const double ppl_ratio_val = exp(log_ppl_ratio_val);
  const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
- printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
+ LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
 
  const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
  const double ppl_diff_val = ppl_val - ppl_base_val;
  const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
- printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
+ LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
 
- printf("\n");
+ LOG("\n");
 
- printf("====== KL divergence statistics ======\n");
+ LOG("====== KL divergence statistics ======\n");
  auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
- printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
+ LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
  auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
  : kld_values[kld_values.size()/2];
 
@@ -1915,50 +1909,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
  };

- printf("Maximum KLD: %10.6f\n", kld_values.back());
- printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
- printf("Median KLD: %10.6f\n", kld_median);
- printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
- printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
- printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
- printf("Minimum KLD: %10.6f\n", kld_values.front());
+ LOG("Maximum KLD: %10.6f\n", kld_values.back());
+ LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+ LOG("Median KLD: %10.6f\n", kld_median);
+ LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+ LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+ LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+ LOG("Minimum KLD: %10.6f\n", kld_values.front());

- printf("\n");
+ LOG("\n");

- printf("====== Token probability statistics ======\n");
+ LOG("====== Token probability statistics ======\n");

  auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
- printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
+ LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);

  auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
  : p_diff_values[p_diff_values.size()/2];

- printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
- printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
- printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
- printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
- printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
- printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
- printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
- printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
- printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
- printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
- printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
- printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
- printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
+ LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
+ LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+ LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+ LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+ LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+ LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+ LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
+ LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+ LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+ LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+ LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+ LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+ LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());

  auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
- // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
+ // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);

  const double p_diff_rms_val = sqrt(p_diff_mse.first);
  const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
- printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
+ LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);

  const double same_top_p = 1.0*kld.n_same_top/kld.count;
- printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
-
+ LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
  }

  int main(int argc, char ** argv) {
@@ -1966,16 +1959,18 @@ int main(int argc, char ** argv) {

  params.n_ctx = 512;
  params.logits_all = true;
+ params.escape = false;

- if (!gpt_params_parse(argc, argv, params)) {
- gpt_params_print_usage(argc, argv, params);
+ if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
  return 1;
  }

+ gpt_init();
+
  const int32_t n_ctx = params.n_ctx;

  if (n_ctx <= 0) {
- fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
+ LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
  return 1;
  }

@@ -2000,45 +1995,35 @@ int main(int argc, char ** argv) {
  }

  if (params.ppl_stride > 0) {
- fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+ LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
  params.n_ctx, params.n_ctx + params.ppl_stride/2);
  params.n_ctx += params.ppl_stride/2;
  }

- print_build_info();
-
- if (params.seed == LLAMA_DEFAULT_SEED) {
- params.seed = time(NULL);
- }
-
- fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
-
  llama_backend_init();
  llama_numa_init(params.numa);

- llama_model * model;
- llama_context * ctx;
-
  // load the model and apply lora adapter, if any
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ llama_model * model = llama_init.model;
+ llama_context * ctx = llama_init.context;
  if (model == NULL) {
- fprintf(stderr, "%s: error: unable to load model\n", __func__);
+ LOG_ERR("%s: unable to load model\n", __func__);
  return 1;
  }

  const int n_ctx_train = llama_n_ctx_train(model);

  if (params.n_ctx > n_ctx_train) {
- fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+ LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
  __func__, n_ctx_train, params.n_ctx);
  }

  // print system information
  {
- fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("\n");
+ LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
  }

  struct results_perplexity results;
@@ -2054,7 +2039,9 @@ int main(int argc, char ** argv) {
  results = perplexity(ctx, params, n_ctx);
  }

- llama_print_timings(ctx);
+ LOG("\n");
+ llama_perf_context_print(ctx);
+
  write_logfile(ctx, params, model, results);

  llama_free(ctx);