@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -1,18 +1,21 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
+#include <algorithm>
+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -40,7 +43,7 @@ static void write_logfile(
     }
 
     if (params.hellaswag) {
-
+        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
         return;
     }
 
@@ -48,7 +51,7 @@ static void write_logfile(
 
     const bool success = fs_create_directory_with_parents(params.logdir);
     if (!success) {
-
+        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
            __func__, params.logdir.c_str());
         return;
     }
@@ -57,7 +60,7 @@ static void write_logfile(
     FILE * logfile = fopen(logfile_path.c_str(), "w");
 
     if (logfile == NULL) {
-
+        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
         return;
     }
 
@@ -76,7 +79,7 @@ static void write_logfile(
     fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
     yaml_dump_vector_float(logfile, "probs", results.probs);
 
-
+    llama_perf_dump_yaml(logfile, ctx);
     fclose(logfile);
 }
 
@@ -340,19 +343,19 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos =
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx))
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
-
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
     std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
     const int n_ctx = llama_n_ctx(ctx);
 
     if (int(tokens.size()) < 2*n_ctx) {
-
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
            n_ctx);
-
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
         return {std::move(tokens), 0., {}, {}};
     }
 
@@ -363,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     prob_history.resize(tokens.size());
 
     if (params.ppl_stride <= 0) {
-
+        LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
 
     const int calc_chunk = n_ctx;
 
-
+    LOG_INF("%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
 
     if (int(tokens.size()) <= calc_chunk) {
-
+        LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
            tokens.size(), n_ctx, params.ppl_stride);
         return {tokens, -1, logit_history, prob_history};
     }
@@ -386,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     int count = 0;
     double nll = 0.0;
 
-
+    LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
 
     for (int i = 0; i < n_chunk; ++i) {
        const int start = i * params.ppl_stride;
        const int end = start + calc_chunk;
 
        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
-        //
+        //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
 
        std::vector<float> logits;
 
@@ -406,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
            const int batch_start = start + j * n_batch;
            const int batch_size = std::min(end - batch_start, n_batch);
 
-            //
+            //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
            // TODO: use llama_batch.logits instead of relying on logits_all == true
            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                //
+                //LOG_ERR("%s : failed to eval\n", __func__);
                return {tokens, -1, logit_history, prob_history};
            }
 
@@ -433,16 +436,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
 
        if (i == 0) {
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }
 
-        //
+        //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
        for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
 
            // Calculate probability of next token, given the previous ones.
@@ -459,13 +462,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
        }
        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
-
+            LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
        } else {
-
+            LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
        }
-        fflush(stdout);
    }
-
+    LOG("\n");
 
    return {tokens, std::exp(nll / count), logit_history, prob_history};
 }
@@ -480,33 +482,33 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    // Output: `perplexity: 13.5106 [114/114]`
    // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos =
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx))
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
    std::ofstream logits_stream;
    if (!params.logits_file.empty()) {
        logits_stream.open(params.logits_file.c_str(), std::ios::binary);
        if (!logits_stream.is_open()) {
-
+            LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
            return {};
        }
-
+        LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
        logits_stream.write("_logits_", 8);
        logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
    }
 
    auto tim1 = std::chrono::high_resolution_clock::now();
-
+    LOG_INF("%s: tokenizing the input ..\n", __func__);
 
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
    auto tim2 = std::chrono::high_resolution_clock::now();
-
+    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
    if (int(tokens.size()) < 2*n_ctx) {
-
+        LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
            n_ctx);
-
+        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
        return {std::move(tokens), 0., {}, {}};
    }
 
@@ -539,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        logits.reserve((size_t)n_ctx * n_vocab);
    }
 
-
+    LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 
    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -612,7 +614,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        }
 
        if (llama_decode(ctx, batch)) {
-
+            LOG_INF("%s : failed to eval\n", __func__);
            return {tokens, -1, logit_history, prob_history};
        }
 
@@ -627,13 +629,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total*n_chunk/n_seq);
            if (total_seconds >= 60*60) {
-
+                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-
+            LOG("%.2f minutes\n", total_seconds / 60.0);
        }
 
        for (int seq = 0; seq < n_seq_batch; seq++) {
@@ -655,19 +657,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
 
            // perplexity is e^(average negative log-likelihood)
            if (params.ppl_output_type == 0) {
-
+                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            } else {
                double av = nll/count;
                double av2 = nll2/count - av*av;
                if (av2 > 0) av2 = sqrt(av2/(count-1));
-
+                LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
            }
        }
-        fflush(stdout);
 
        logits.clear();
    }
-
+    LOG("\n");
 
    nll2 /= count;
    nll /= count;
@@ -675,9 +676,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    nll2 -= nll * nll;
    if (nll2 > 0) {
        nll2 = sqrt(nll2/(count-1));
-
+        LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
    } else {
-
+        LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
    }
 
    llama_batch_free(batch);
@@ -703,7 +704,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
 
    const int ret = llama_decode(ctx, batch_view);
    if (ret != 0) {
-
+        LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
        return false;
    }
 
@@ -789,15 +790,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    }
 
    if (prompt_lines.size() % 6 != 0) {
-
+        LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__);
        return;
    }
 
    size_t hs_task_count = prompt_lines.size()/6;
-
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-
+    LOG_INF("================================= is_spm = %d\n", is_spm);
 
    // The tasks should be randomized so the score stabilizes quickly.
    bool randomize_tasks = true;
@@ -824,7 +825,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        std::vector<llama_token> seq_tokens[4];
    };
 
-
+    LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") );
 
    // Select and read data from prompt lines
    std::vector<hs_data_t> hs_data(hs_task_count);
@@ -870,9 +871,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
    }
 
-
+    LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__);
 
-
+    LOG("\ntask\tacc_norm\n");
 
    double acc = 0.0f;
 
@@ -940,7 +941,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
 
        if (i0 == i1) {
-
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }
 
@@ -948,7 +949,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }
 
@@ -998,7 +999,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
            }
        }
 
-        //
+        //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx);
 
        // If the gold ending got the maximum logprobe add one accuracy point
        if (ending_logprob_max_idx == hs_cur.gold_ending_idx) {
@@ -1006,8 +1007,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }
 
        // Print the accumulated accuracy mean x 100
-
-        fflush(stdout);
+        LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0);
    }
 
    i0 = i1 - 1;
@@ -1015,7 +1015,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
 
    llama_batch_free(batch);
 
-
+    LOG("\n");
 }
 
 struct winogrande_entry {
@@ -1059,7 +1059,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
        }
    }
    if (ipos != 4) {
-
+            LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str());
        continue;
    }
    auto sentence = line[comma_pos[0]+1] == '"' ? line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3)
@@ -1073,13 +1073,13 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
        if (sentence[where] == '_') break;
    }
    if (where == int(sentence.size())) {
-
+            LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str());
        continue;
    }
    std::istringstream stream(answer.c_str());
    int i_answer; stream >> i_answer;
    if (stream.fail() || i_answer < 1 || i_answer > 2) {
-
+            LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
        continue;
    }
    result.emplace_back();
@@ -1108,14 +1108,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
    auto data = load_winogrande_from_csv(params.prompt);
    if (data.empty()) {
-
+        LOG_ERR("%s: no tasks\n", __func__);
        return;
    }
 
-
+    LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size());
 
    if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) {
-
+        LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks);
        std::mt19937 rng(1);
        std::vector<int> aux(data.size());
        for (int i = 0; i < int(data.size()); ++i) {
@@ -1133,7 +1133,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        data = std::move(selected);
    }
 
-
+    LOG_INF("%s : tokenizing selected tasks\n", __func__);
 
    for (auto & task : data) {
        task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
@@ -1156,7 +1156,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
    }
 
-
+    LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);
 
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@@ -1217,7 +1217,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        }
 
        if (i0 == i1) {
-
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }
 
@@ -1225,7 +1225,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }
 
@@ -1285,20 +1285,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
        ++n_done;
 
        // print the accumulated accuracy mean x 100
-
-        fflush(stdout);
+        LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer);
    }
 
    i0 = i1 - 1;
    }
 
-
+    LOG("\n");
 
    if (n_done < 100) return;
 
    const float p = 1.f*n_correct/n_done;
    const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1));
-
+
+    LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }
 
 static bool deserialize_string(std::istream & in, std::string & str) {
@@ -1347,7 +1347,7 @@ struct multiple_choice_task {
 static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) {
    if (task.question.empty() || task.mc1.answers.empty()) {
        if (log_error) {
-
+            LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__);
        }
        return false;
    }
@@ -1355,7 +1355,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
    for (auto& answer : task.mc1.answers) {
        if (answer.empty()) {
            if (log_error) {
-
+                LOG_ERR("%s: found empty answer\n", __func__);
            }
            return false;
        }
@@ -1409,14 +1409,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    uint32_t n_task;
    strstream.read((char *)&n_task, sizeof(n_task));
    if (strstream.fail() || n_task == 0) {
-
+        LOG_ERR("%s: no tasks\n", __func__);
        return;
    }
-
+    LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task);
    std::vector<uint32_t> task_pos(n_task);
    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
    if (strstream.fail()) {
-
+        LOG_ERR("%s: failed to read task positions from prompt\n", __func__);
        return;
    }
 
@@ -1424,21 +1424,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
        // Use all tasks
        tasks.resize(n_task);
-
+        LOG_INF("%s: reading tasks", __func__);
        int n_dot = std::max((int) n_task/100, 1);
        int i = 0;
        for (auto& task : tasks) {
            ++i;
            if (!task.deserialize(strstream)) {
-
+                LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task);
                return;
            }
-            if (i%n_dot == 0)
+            if (i%n_dot == 0) LOG(".");
        }
-
+        LOG("done\n");
    }
    else {
-
+        LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
        std::mt19937 rng(1);
        std::vector<int> aux(n_task);
        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
@@ -1451,18 +1451,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
            aux.pop_back();
            strstream.seekg(task_pos[idx], std::ios::beg);
            if (!task.deserialize(strstream)) {
-
+                LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
                return;
            }
        }
        n_task = params.multiple_choice_tasks;
    }
 
-
-    fflush(stdout);
+    LOG_INF("%s: preparing task data", __func__);
    if (n_task > 500) {
-
-        fflush(stdout);
+        LOG("...");
        std::atomic<int> counter(0);
        std::atomic<int> n_bad(0);
        auto prepare = [&counter, &n_bad, &tasks, ctx] () {
@@ -1486,11 +1484,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        for (auto& w : workers) w = std::thread(prepare);
        prepare();
        for (auto& w : workers) w.join();
-
-    fflush(stdout);
+        LOG("done\n");
        int nbad = n_bad;
        if (nbad > 0) {
-
+            LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad);
            return;
        }
    } else {
@@ -1502,16 +1499,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                return;
            }
            if (i_task%n_dot == 0) {
-
-                fflush(stdout);
+                LOG(".");
            }
        }
-
+        LOG("done\n");
    }
 
-
+    LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
 
-
+    LOG("\ntask\tacc_norm\n");
 
    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@@ -1590,7 +1586,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        }
 
        if (i0 == i1) {
-
+            LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0);
            return;
        }
 
@@ -1598,7 +1594,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
            return;
        }
 
@@ -1622,13 +1618,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
    // compute the logprobs for each ending of the decoded tasks
    for (size_t i = i0; i < i1; ++i) {
        auto & cur_task = tasks[i];
-        //
+        //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
        //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
        // if (cur_task.mc1.labels[j] == 1) {
-        //
+        // LOG("%d", j+1);
        // }
        //}
-        //
+        //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);
 
        // get the logits of the last token of the common prefix
        std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
@@ -1640,13 +1636,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        size_t count = 1;
        float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
        for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-            //
+            //LOG(" %zu %g\n", ir, eval_results[ir]);
            ++count;
            log_prob += eval_results[ir++];
        }
        cur_task.log_probs[s] = log_prob / count;
-        //
-        //
+        //LOG(" Final: %g\n", log_prob / count);
+        //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
    }
 
    // Find the ending with maximum logprob
@@ -1666,8 +1662,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
        ++n_done;
 
        // Print the accumulated accuracy mean x 100
-
-        fflush(stdout);
+        LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
    }
 
    i0 = i1 - 1;
@@ -1679,29 +1674,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
 
    float p = 1.f*n_correct/n_done;
    float sigma = sqrt(p*(1-p)/(n_done-1));
-
+    LOG("\n");
+    LOG_INF("Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
    p = 1.f*n_done/n_tot_answers;
    sigma = sqrt(p*(1-p)/(n_done-1));
-
+    LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
 
-
+    LOG_INF("\n");
 }
 
 static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    if (params.logits_file.empty()) {
-
+        LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
        return;
    }
    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
    if (!in) {
-
+        LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str());
        return;
    }
    {
        char check[9]; check[8] = 0;
        in.read(check, 8);
        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-
+            LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
            return;
        }
    }
@@ -1709,7 +1705,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    uint32_t n_ctx;
    in.read((char *)&n_ctx, sizeof(n_ctx));
    if (n_ctx > llama_n_ctx(ctx)) {
-
+        LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
            __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
    }
 
@@ -1717,24 +1713,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    in.read((char *)&n_vocab, sizeof(n_vocab));
    in.read((char *)&n_chunk, sizeof(n_chunk));
    if (in.fail()) {
-
+        LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
        return;
    }
    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
    }
 
    std::vector<llama_token> tokens(n_ctx * n_chunk);
    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-
+        LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
        return;
    }
 
    const int n_batch = params.n_batch;
    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
    const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos =
-    GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx))
+    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 
    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1775,7 +1771,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    const auto t_start = std::chrono::high_resolution_clock::now();
 
    if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-
+            LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i);
        return;
    }
 
@@ -1796,7 +1792,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
        // TODO: use llama_batch.logits instead of relying on logits_all == true
        if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-
+            LOG_ERR("%s : failed to eval\n", __func__);
            return;
        }
 
@@ -1813,16 +1809,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 
    if (i == 0) {
        const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-
+            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
        int total_seconds = (int)(t_total * n_chunk);
        if (total_seconds >= 60*60) {
-
+                LOG("%d hours ", total_seconds / (60*60));
            total_seconds = total_seconds % (60*60);
        }
-
-
-        printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
+            LOG("%.2f minutes\n", total_seconds / 60.0);
    }
+        LOG("\n");
+        LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
 
    const int first = n_ctx/2;
    const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
@@ -1831,79 +1827,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
    p_diff_ptr += n_ctx - 1 - first;
    kld_ptr += n_ctx - 1 - first;
 
-
+        LOG("%4d", i+1);
 
    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
    const double ppl_val = exp(log_ppl.first);
    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-
+        LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc);
 
    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-
+        LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc);
 
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-
+        LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second);
 
    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
    const double p_diff_rms_val = sqrt(p_diff_mse.first);
    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-
+        LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
    double p_top_val = 1.*kld.n_same_top/kld.count;
    double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1));
-
+        LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc);
 
-
-
-        fflush(stdout);
+        LOG("\n");
 
    logits.clear();
    }
-
+    LOG("\n");
 
    if (kld.count < 100) return; // we do not wish to do statistics on so few values
 
    std::sort(kld_values.begin(), kld_values.end());
    std::sort(p_diff_values.begin(), p_diff_values.end());
 
-
+    LOG("====== Perplexity statistics ======\n");
 
    auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
    const double ppl_val = exp(log_ppl.first);
    const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 )
-
+    LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc);
 
    auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count);
    const double ppl_base_val = exp(log_ppl_base.first);
    const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 )
-
+    LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc);
 
    const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count);
-    //
+    // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov);
    const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second);
-
+    LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor);
 
    const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first;
    const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov);
-
+    LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc);
 
    const double ppl_ratio_val = exp(log_ppl_ratio_val);
    const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 )
-
+    LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc);
 
    const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov;
    const double ppl_diff_val = ppl_val - ppl_base_val;
    const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov);
-
+    LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc);
 
-
+    LOG("\n");
 
-
+    LOG("====== KL divergence statistics ======\n");
    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-
+    LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second);
    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
        : kld_values[kld_values.size()/2];
 
@@ -1915,50 +1909,49 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
        return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
    };
 
-
-
-
-
-
-
-
-
-
+    LOG("Maximum KLD: %10.6f\n", kld_values.back());
+    LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("Median KLD: %10.6f\n", kld_median);
+    LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG("Minimum KLD: %10.6f\n", kld_values.front());
 
-
+    LOG("\n");
 
-
+    LOG("====== Token probability statistics ======\n");
 
    auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-
+    LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
 
    auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1])
        : p_diff_values[p_diff_values.size()/2];
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back());
+    LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f));
+    LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f));
+    LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f));
+    LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f));
+    LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f));
+    LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median);
+    LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f));
+    LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f));
+    LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f));
+    LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f));
+    LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f));
+    LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front());
 
    auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count);
-    //
+    // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second);
 
    const double p_diff_rms_val = sqrt(p_diff_mse.first);
    const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second;
-
+    LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc);
 
    const double same_top_p = 1.0*kld.n_same_top/kld.count;
-
-
+    LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1)));
 }
 
 int main(int argc, char ** argv) {
@@ -1966,16 +1959,18 @@ int main(int argc, char ** argv) {
 
    params.n_ctx = 512;
    params.logits_all = true;
+    params.escape = false;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }
 
+    gpt_init();
+
    const int32_t n_ctx = params.n_ctx;
 
    if (n_ctx <= 0) {
-
+        LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }
 
@@ -2000,45 +1995,35 @@ int main(int argc, char ** argv) {
    }
 
    if (params.ppl_stride > 0) {
-
+        LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
            params.n_ctx, params.n_ctx + params.ppl_stride/2);
        params.n_ctx += params.ppl_stride/2;
    }
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
    llama_backend_init();
    llama_numa_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
-
    // load the model and apply lora adapter, if any
-
+    llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+    llama_model * model = llama_init.model;
+    llama_context * ctx = llama_init.context;
    if (model == NULL) {
-
+        LOG_ERR("%s: unable to load model\n", __func__);
        return 1;
    }
 
    const int n_ctx_train = llama_n_ctx_train(model);
 
    if (params.n_ctx > n_ctx_train) {
-
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
            __func__, n_ctx_train, params.n_ctx);
    }
 
    // print system information
    {
-
-
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
    }
 
    struct results_perplexity results;
@@ -2054,7 +2039,9 @@ int main(int argc, char ** argv) {
        results = perplexity(ctx, params, n_ctx);
    }
 
-
+    LOG("\n");
+    llama_perf_context_print(ctx);
+
    write_logfile(ctx, params, model, results);
 
    llama_free(ctx);