@fugood/llama.node 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -10
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +6 -4
- package/src/LlamaCompletionWorker.cpp +6 -6
- package/src/LlamaContext.cpp +7 -9
- package/src/common.hpp +2 -1
- package/src/llama.cpp/.github/workflows/build.yml +98 -24
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +43 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +20 -8
- package/src/llama.cpp/common/CMakeLists.txt +12 -10
- package/src/llama.cpp/common/arg.cpp +2006 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +496 -1632
- package/src/llama.cpp/common/common.h +161 -63
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +3 -0
- package/src/llama.cpp/common/sampling.cpp +348 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/common/train.cpp +2 -0
- package/src/llama.cpp/docs/build.md +36 -1
- package/src/llama.cpp/examples/CMakeLists.txt +0 -1
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +39 -55
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +34 -44
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +15 -15
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +143 -87
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +33 -33
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +36 -35
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +34 -27
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +59 -62
- package/src/llama.cpp/examples/infill/infill.cpp +117 -132
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +265 -58
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +29 -22
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +685 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +47 -58
- package/src/llama.cpp/examples/llava/llava.cpp +110 -24
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +42 -43
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +10 -8
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +23 -22
- package/src/llama.cpp/examples/lookup/lookup.cpp +40 -43
- package/src/llama.cpp/examples/main/main.cpp +210 -262
- package/src/llama.cpp/examples/parallel/parallel.cpp +49 -49
- package/src/llama.cpp/examples/passkey/passkey.cpp +42 -50
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +187 -200
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +2 -3
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +49 -44
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +24 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +32 -35
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -5
- package/src/llama.cpp/examples/server/server.cpp +1027 -1073
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +107 -105
- package/src/llama.cpp/examples/simple/simple.cpp +35 -41
- package/src/llama.cpp/examples/speculative/speculative.cpp +129 -103
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +25 -27
- package/src/llama.cpp/ggml/CMakeLists.txt +14 -3
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-backend.h +145 -60
- package/src/llama.cpp/ggml/include/ggml-blas.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +15 -19
- package/src/llama.cpp/ggml/include/ggml-cuda.h +16 -16
- package/src/llama.cpp/ggml/include/ggml-metal.h +5 -8
- package/src/llama.cpp/ggml/include/ggml-rpc.h +5 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +8 -8
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +7 -7
- package/src/llama.cpp/ggml/include/ggml.h +293 -186
- package/src/llama.cpp/ggml/src/CMakeLists.txt +86 -44
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +2135 -1119
- package/src/llama.cpp/ggml/src/ggml-alloc.c +6 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +152 -70
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +606 -286
- package/src/llama.cpp/ggml/src/ggml-blas.cpp +9 -10
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-cann.cpp +215 -216
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu-impl.h +614 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +49 -603
- package/src/llama.cpp/ggml/src/ggml-kompute.cpp +4 -24
- package/src/llama.cpp/ggml/src/ggml-quants.c +972 -92
- package/src/llama.cpp/ggml/src/ggml-quants.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-rpc.cpp +116 -66
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +52 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +16 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +6 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl.cpp +97 -169
- package/src/llama.cpp/ggml/src/ggml-vulkan.cpp +1508 -1124
- package/src/llama.cpp/ggml/src/ggml.c +3001 -1647
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +192 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +88 -40
- package/src/llama.cpp/include/llama.h +241 -264
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +1375 -303
- package/src/llama.cpp/src/llama-sampling.h +20 -47
- package/src/llama.cpp/src/llama-vocab.cpp +343 -120
- package/src/llama.cpp/src/llama-vocab.h +33 -17
- package/src/llama.cpp/src/llama.cpp +4247 -1525
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +3 -0
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1592 -289
- package/src/llama.cpp/tests/test-barrier.cpp +93 -0
- package/src/llama.cpp/tests/test-grad0.cpp +187 -70
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +6 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-quantize-fns.cpp +6 -0
- package/src/llama.cpp/tests/test-rope.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +157 -98
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +55 -35
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275

package/src/llama.cpp/examples/imatrix/imatrix.cpp

@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <cmath>
@@ -17,15 +19,13 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void print_usage(int
-
-
-
-LOG_TEE("\n %s \\\n"
-" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+static void print_usage(int, char ** argv) {
+LOG("\nexample usage:\n");
+LOG("\n %s \\\n"
+" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n"
 " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
 " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
-
+LOG("\n");
 }
 
 struct Stats {
@@ -126,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 e.counts.resize(src1->ne[0]*n_as, 0);
 }
 else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-
+LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
 exit(1); //GGML_ABORT("fatal error");
 }
-
-printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
-}
+LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
 // loop over all possible experts, regardless if they are used or not in the batch
 for (int ex = 0; ex < n_as; ++ex) {
 size_t e_start = ex*src1->ne[0];
@@ -152,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 e.values[e_start + j] += x[j]*x[j];
 e.counts[e_start + j]++;
 if (!std::isfinite(e.values[e_start + j])) {
-
+LOG("\n");
+LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str());
 exit(1);
 }
 }
@@ -175,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 e.counts.resize(src1->ne[0], 0);
 }
 else if (e.values.size() != (size_t)src1->ne[0]) {
-
+LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
 exit(1); //GGML_ABORT("fatal error");
 }
 ++e.ncall;
-
-printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
-}
+LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
 for (int row = 0; row < (int)src1->ne[1]; ++row) {
 const float * x = data + row * src1->ne[0];
 for (int j = 0; j < (int)src1->ne[0]; ++j) {
 e.values[j] += x[j]*x[j];
 e.counts[j]++;
 if (!std::isfinite(e.values[j])) {
-
+LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str());
 exit(1);
 }
 }
@@ -240,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const {
 }
 
 if (n_zeros != 0 && is_first) {
-
+LOG_INF("\n");
 is_first = false;
 }
 
 if (n_zeros == n_all) {
-
+LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
 continue;
 }
 
 if (n_zeros > 0) {
-
+LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
 continue;
 }
 
@@ -259,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const {
 }
 
 if (to_store.size() < m_stats.size()) {
-
+LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
 }
 
 std::ofstream out(fname, std::ios::binary);
@@ -291,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const {
 out.write(m_params.prompt_file.c_str(), len);
 }
 
-
-
-}
+LOGV(1, "\n");
+LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
 }
 
 bool IMatrixCollector::load_imatrix(const char * fname) {
 std::ifstream in(fname, std::ios::binary);
 if (!in) {
-
+LOG_ERR("%s: failed to open %s\n",__func__, fname);
 return false;
 }
 int n_entries;
 in.read((char*)&n_entries, sizeof(n_entries));
 if (in.fail() || n_entries < 1) {
-
+LOG_ERR("%s: no data in file %s\n", __func__, fname);
 return false;
 }
 for (int i = 0; i < n_entries; ++i) {
@@ -313,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
 std::vector<char> name_as_vec(len+1);
 in.read((char *)name_as_vec.data(), len);
 if (in.fail()) {
-
+LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
 return false;
 }
 name_as_vec[len] = 0;
@@ -324,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
 int nval;
 in.read((char *)&nval, sizeof(nval));
 if (in.fail() || nval < 1) {
-
+LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i);
 m_stats = {};
 return false;
 }
@@ -337,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) {
 std::vector<float> tmp(nval);
 in.read((char*)tmp.data(), nval*sizeof(float));
 if (in.fail()) {
-
+LOG_ERR("%s: failed reading data for entry %d\n",__func__,i);
 m_stats = {};
 return false;
 }
@@ -433,31 +429,30 @@ static void process_logits(
 }
 
 static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-const bool add_bos =
-GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx))
+const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
 const int n_ctx = llama_n_ctx(ctx);
 
 auto tim1 = std::chrono::high_resolution_clock::now();
-
+LOG_INF("%s: tokenizing the input ..\n", __func__);
 
 std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
 
 auto tim2 = std::chrono::high_resolution_clock::now();
-
+LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
 if (params.i_chunk > 0) {
 if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-
+LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
 return false;
 }
-
+LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
 tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
 }
 
 if (int(tokens.size()) < 2*n_ctx) {
-
-
-fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
+LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
+LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
 return false;
 }
 
@@ -479,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 double nll = 0.0;
 double nll2 = 0.0;
 
-
+LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
 
 std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 
@@ -515,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
 // TODO: use batch.logits to save computations instead of relying on logits_all == true
 if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-
+LOG_ERR("%s : failed to eval\n", __func__);
 return false;
 }
 
@@ -532,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 
 if (i == 0) {
 const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-
+LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
 int total_seconds = (int)(t_total * n_chunk);
 if (total_seconds >= 60*60) {
-
+LOG("%d hours ", total_seconds / (60*60));
 total_seconds = total_seconds % (60*60);
 }
-
+LOG("%.2f minutes\n", total_seconds / 60.0);
 }
 
 if (params.compute_ppl) {
 const int first = n_ctx/2;
-const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
+const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
 process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
 workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
 count += n_ctx - first - 1;
 
-
+LOG("[%d]%.4lf,", i + 1, std::exp(nll / count));
 fflush(stdout);
 
 logits.clear();
 }
 }
-
+LOG("\n");
 
 if (params.compute_ppl) {
 nll2 /= count;
@@ -563,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
 nll2 -= nll * nll;
 if (nll2 > 0) {
 nll2 = sqrt(nll2/(count-1));
-
+LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
 } else {
-
+LOG("Unexpected negative standard deviation of log(prob)\n");
 }
 }
 
@@ -577,27 +572,28 @@ int main(int argc, char ** argv) {
 
 params.n_ctx = 512;
 params.logits_all = true;
-params.
+params.escape = false;
 
-if (!gpt_params_parse(argc, argv, params)) {
-print_usage(argc, argv, params);
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
 return 1;
 }
 
+gpt_init();
+
 params.n_batch = std::min(params.n_batch, params.n_ctx);
 
 g_collector.set_params(params);
 
 for (const auto & in_file : params.in_files) {
-
+LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
 if (!g_collector.load_imatrix(in_file.c_str())) {
-
+LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
 return 1;
 }
 }
 
 if (params.in_files.size() > 1) {
-
+LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
 g_collector.save_imatrix();
 }
 
@@ -611,25 +607,25 @@ int main(int argc, char ** argv) {
 params.warmup = false;
 
 // init
-
-llama_context * ctx;
+llama_init_result llama_init = llama_init_from_gpt_params(params);
 
-
+llama_model * model = llama_init.model;
+llama_context * ctx = llama_init.context;
 if (model == nullptr || ctx == nullptr) {
-
+LOG_ERR("%s : failed to init\n", __func__);
 return 1;
 }
 
 const int n_ctx_train = llama_n_ctx_train(model);
 if (params.n_ctx > n_ctx_train) {
-
+LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
 __func__, n_ctx_train, params.n_ctx);
 }
 
 // print system information
 {
-
-
+LOG_INF("\n");
+LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
 }
 
 if (!compute_imatrix(ctx, params)) {
@@ -638,7 +634,8 @@ int main(int argc, char ** argv) {
 
 g_collector.save_imatrix();
 
-
+LOG("\n");
+llama_perf_context_print(ctx);
 
 llama_free(ctx);
 llama_free_model(model);