@fugood/llama.node 0.3.7 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -0
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6

package/src/llama.cpp/examples/parallel/parallel.cpp
@@ -132,8 +132,10 @@ int main(int argc, char ** argv) {
     // load the target model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     // load the prompts from an external file if there are any
     if (params.prompt.empty()) {
@@ -358,7 +360,7 @@ int main(int argc, char ** argv) {
             // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
             if (client.n_decoded > 2 &&
-                    (llama_token_is_eog(model, id) ||
+                    (llama_vocab_is_eog(vocab, id) ||
                      (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
                      client.response.find("User:") != std::string::npos ||
                      client.response.find('\n') != std::string::npos)) {
@@ -416,9 +418,6 @@ int main(int argc, char ** argv) {
 
     llama_batch_free(batch);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     LOG("\n\n");
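
The .get() calls in these hunks reflect that common_init_result now appears to own the model and context through the smart pointers declared in include/llama-cpp.h (which also changes in this release), which is why the examples drop their explicit llama_free()/llama_free_model() calls. A minimal sketch of that usage pattern, under that assumption and with a hypothetical placeholder model path:

#include "common.h"
#include "llama.h"

int main() {
    common_params params;
    params.model = "model.gguf"; // placeholder path, not part of the package

    llama_backend_init();

    // assumption: common_init_result holds llama_model_ptr / llama_context_ptr,
    // so the raw pointers below are non-owning views
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model.get();
    llama_context * ctx   = llama_init.context.get();

    if (model == NULL || ctx == NULL) {
        llama_backend_free();
        return 1;
    }

    // ... use model/ctx as before; both are released when llama_init goes out of scope

    llama_backend_free();
    return 0;
}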

package/src/llama.cpp/examples/passkey/passkey.cpp
@@ -63,22 +63,24 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // initialize the context
 
     llama_context_params ctx_params = common_context_params_to_llama(params);
 
-    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
+    ctx_params.n_ctx = llama_model_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
     if (ctx == NULL) {
         LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return 1;
@@ -223,7 +225,7 @@ int main(int argc, char ** argv) {
         const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
         // is it an end of generation?
-        if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+        if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
             LOG("\n");
 
             break;
@@ -266,7 +268,7 @@ int main(int argc, char ** argv) {
     llama_batch_free(batch);
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
 
     llama_backend_free();
 

package/src/llama.cpp/examples/perplexity/perplexity.cpp
@@ -296,8 +296,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     LOG_INF("%s: tokenizing the input ..\n", __func__);
 
@@ -338,7 +341,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_batch = params.n_batch;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     int count = 0;
     double nll = 0.0;
@@ -382,7 +385,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
 
         // add BOS token for the first batch of each chunk
         if (add_bos && j == 0) {
-            tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+            tokens[batch_start] = llama_vocab_bos(vocab);
         }
 
         const auto * batch_logits = llama_get_logits(ctx);
@@ -444,8 +447,11 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     std::ofstream logits_stream;
     if (!params.logits_file.empty()) {
@@ -485,7 +491,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
     const int n_batch = params.n_batch;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     int count = 0;
     double nll = 0.0;
@@ -557,7 +563,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[seq_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[seq_start] = llama_vocab_bos(vocab);
             }
 
             for (int k = 0; k < batch_size; ++k) {
@@ -732,6 +738,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
 }
 
 static void hellaswag_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
@@ -765,7 +774,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
     size_t hs_task_count = prompt_lines.size()/6;
     LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
+    const bool is_spm = llama_vocab_type(vocab) == LLAMA_VOCAB_TYPE_SPM;
     LOG_INF("================================= is_spm = %d\n", is_spm);
 
     // The tasks should be randomized so the score stabilizes quickly.
@@ -848,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1072,6 +1081,8 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  *
  */
 static void winogrande_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     constexpr int k_min_trailing_ctx = 3;
 
@@ -1130,7 +1141,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     const int max_tasks_per_batch = 128;
     const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1374,6 +1385,8 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
 // https://huggingface.co/datasets/truthful_qa
 //
 static void multiple_choice_score(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     std::istringstream strstream(params.prompt);
     uint32_t n_task;
@@ -1482,7 +1495,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;
 
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
 
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
@@ -1655,6 +1668,9 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
 }
 
 static void kl_divergence(llama_context * ctx, const common_params & params) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.logits_file.empty()) {
         LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
         return;
@@ -1688,8 +1704,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
         LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
         return;
     }
-    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
+    if (n_vocab != llama_vocab_n_tokens(vocab)) {
+        LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_vocab_n_tokens(vocab));
     }
 
     std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
@@ -1701,8 +1717,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
     const int n_batch = params.n_batch;
     const int num_batches = (n_ctx + n_batch - 1)/n_batch;
     const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
     std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
@@ -1761,7 +1777,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
 
         // add BOS token for the first batch of each chunk
         if (add_bos && j == 0) {
-            tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+            tokens[batch_start] = llama_vocab_bos(vocab);
         }
 
         common_batch_clear(batch);
@@ -1987,14 +2003,15 @@ int main(int argc, char ** argv) {
     // load the model and apply lora adapter, if any
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
 
     if (params.n_ctx > n_ctx_train) {
         LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
@@ -2023,9 +2040,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);
 
-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();
 
     return 0;
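
The recurring change across the perplexity hunks is the move from model-level token helpers to the llama_vocab handle returned by llama_model_get_vocab (llama_vocab_n_tokens, llama_vocab_bos, llama_vocab_get_add_bos, llama_vocab_is_eog). A short self-contained sketch of that accessor style, assuming only the vendored llama.h and a hypothetical model path:

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();

    // placeholder path, not shipped with the package
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // vocabulary queries now go through the llama_vocab handle
    const llama_vocab * vocab = llama_model_get_vocab(model);

    printf("n_vocab = %d\n", llama_vocab_n_tokens(vocab));
    printf("bos     = %d\n", llama_vocab_bos(vocab));
    printf("add_bos = %d\n", llama_vocab_get_add_bos(vocab) ? 1 : 0);

    llama_model_free(model);
    llama_backend_free();
    return 0;
}

The same handle replaces the llama_get_model(ctx)-based lookups inside the perplexity helpers above.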

package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp
@@ -1,7 +1,7 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "llama-impl.h"
+#include "llama-context.h"
+#include "common.h"
 
 #include <algorithm>
 #include <cassert>
@@ -9,11 +9,9 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -311,7 +309,7 @@ int main(int argc, char ** argv) {
         auto mparams = llama_model_default_params();
         mparams.use_mlock = false;
 
-        model = llama_load_model_from_file(params.model.c_str(), mparams);
+        model = llama_model_load_from_file(params.model.c_str(), mparams);
 
         if (model == NULL) {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
@@ -321,22 +319,22 @@ int main(int argc, char ** argv) {
         auto cparams = llama_context_default_params();
         cparams.n_ctx = 256;
 
-        ctx = llama_new_context_with_model(model, cparams);
+        ctx = llama_init_from_model(model, cparams);
 
         if (ctx == NULL) {
             fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
     }
 
-    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(ctx);
 
     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -349,7 +347,7 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
                 "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
             llama_free(ctx);
-            llama_free_model(model);
+            llama_model_free(model);
             return 1;
         }
         included_layers++;
@@ -371,8 +369,8 @@ int main(int argc, char ** argv) {
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
-        const auto *
-        const auto *
+        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
        if (qfns_cpu->from_float && qfns->to_float) {
            if (params.verbose) {
                printf("testing %s ...\n", ggml_type_name(type));
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {
 
         error_stats global_stats {};
 
-        for (const auto& kv_tensor : tensors) {
+        for (const auto & kv_tensor : tensors) {
             if (!layer_included(params, kv_tensor.first)) {
                 continue;
             }
@@ -411,7 +409,7 @@ int main(int argc, char ** argv) {
 
 
     llama_free(ctx);
-    llama_free_model(model);
+    llama_model_free(model);
     // report timing
     {
         const int64_t t_main_end_us = ggml_time_us();

package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -151,15 +151,17 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);
 
-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
 
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
@@ -192,8 +194,8 @@ int main(int argc, char ** argv) {
             return 1;
         }
         // add eos if not present
-        if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) {
-            inp.push_back(llama_token_eos(model));
+        if (llama_vocab_eos(vocab) >= 0 && (inp.empty() || inp.back() != llama_vocab_eos(vocab))) {
+            inp.push_back(llama_vocab_eos(vocab));
         }
         chunk.tokens = inp;
     }
@@ -215,7 +217,7 @@ int main(int argc, char ** argv) {
     struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
 
     // allocate output
-    const int n_embd = llama_n_embd(model);
+    const int n_embd = llama_model_n_embd(model);
     std::vector<float> embeddings(n_chunks * n_embd, 0);
     float * emb = embeddings.data();
 
@@ -298,7 +300,5 @@ int main(int argc, char ** argv) {
 
     // clean up
     llama_batch_free(query_batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();
 }

package/src/llama.cpp/examples/rpc/rpc-server.cpp
@@ -12,6 +12,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#include "ggml-sycl.h"
+#endif
+
 #include "ggml-rpc.h"
 #ifdef _WIN32
 #  include <windows.h>
@@ -91,6 +95,12 @@ static ggml_backend_t create_backend() {
     if (!backend) {
         fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
     }
+#elif GGML_USE_SYCL
+    fprintf(stderr, "%s: using SYCL backend\n", __func__);
+    backend = ggml_backend_sycl_init(0); // init device 0
+    if (!backend) {
+        fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
+    }
 #endif
 
     // if there aren't GPU Backends fallback to CPU backend
@@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
     ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
 #elif GGML_USE_VULKAN
     ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
+#elif GGML_USE_SYCL
+    ggml_backend_sycl_get_device_memory(0, free_mem, total_mem);
 #else
 #ifdef _WIN32
     MEMORYSTATUSEX status;
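
The rpc-server hunks extend the existing compile-time backend selection to SYCL: initialize device 0, then fall back to the CPU backend if no GPU backend was created. A hedged sketch of that selection logic in isolation, assuming the ggml-backend.h, ggml-cpu.h and ggml-sycl.h headers from this tree and a build with GGML_USE_SYCL defined:

#include "ggml-backend.h"
#include "ggml-cpu.h"
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include <cstdio>

static ggml_backend_t create_backend_sketch() {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_SYCL
    fprintf(stderr, "using SYCL backend\n");
    backend = ggml_backend_sycl_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "ggml_backend_sycl_init() failed\n");
    }
#endif
    // fall back to the CPU backend when no GPU backend was created
    if (!backend) {
        backend = ggml_backend_cpu_init();
    }
    return backend;
}

int main() {
    ggml_backend_t backend = create_backend_sketch();
    fprintf(stderr, "backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}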

package/src/llama.cpp/examples/run/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET llama-run)
-add_executable(${TARGET} run.cpp)
+add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)