@fugood/llama.node 0.3.6 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
package/src/llama.cpp/include/llama-cpp.h

@@ -9,7 +9,7 @@

 #include "llama.h"

 struct llama_model_deleter {
-    void operator()(llama_model * model) {
+    void operator()(llama_model * model) { llama_model_free(model); }
 };

 struct llama_context_deleter {

@@ -20,6 +20,11 @@ struct llama_sampler_deleter {
     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
 };

+struct llama_adapter_lora_deleter {
+    void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
+};
+
 typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
 typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
 typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
+typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;
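For context, the new `llama_adapter_lora_ptr` typedef follows the same RAII pattern as the existing smart-pointer aliases in `llama-cpp.h`. A minimal usage sketch against the header shown above, assuming the default-params helpers from `llama.h`; the model and adapter paths are placeholders and error handling is trimmed:

```cpp
#include "llama-cpp.h" // brings in llama.h plus the unique_ptr deleters shown above

int main() {
    llama_backend_init();
    {
        // llama_model_ptr calls llama_model_free() when it leaves scope.
        llama_model_ptr model(llama_model_load_from_file("model.gguf", llama_model_default_params()));
        if (!model) {
            llama_backend_free();
            return 1;
        }

        // llama_adapter_lora_ptr (added in this version) calls llama_adapter_lora_free().
        llama_adapter_lora_ptr adapter(llama_adapter_lora_init(model.get(), "adapter.gguf"));
        llama_context_ptr      ctx(llama_init_from_model(model.get(), llama_context_default_params()));

        if (ctx && adapter) {
            llama_set_adapter_lora(ctx.get(), adapter.get(), 1.0f);
        }
    } // ctx, adapter and model are released here, in reverse declaration order
    llama_backend_free();
    return 0;
}
```

Per the comment in `llama.h` below, loaded adapters are also freed when the owning model is deleted, so the explicit deleter mainly matters when an adapter is dropped before its model.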
package/src/llama.cpp/include/llama.h

@@ -34,7 +34,6 @@

 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF

-// TODO: use everywhere in the implementation
 #define LLAMA_TOKEN_NULL -1

 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'

@@ -57,7 +56,7 @@ extern "C" {
     // TODO: show sample usage
     //

-
+    struct llama_vocab;
     struct llama_model;
     struct llama_context;
     struct llama_sampler;

@@ -105,6 +104,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };

     enum llama_rope_type {

@@ -288,9 +288,6 @@ extern "C" {
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;

-        // comma separated list of RPC servers to use for offloading
-        const char * rpc_servers;
-
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
         // If the provided progress_callback returns true, model loading continues.
         // If it returns false, model loading is immediately aborted.

@@ -385,7 +382,7 @@ extern "C" {
     } llama_chat_message;

     // lora adapter
-    struct
+    struct llama_adapter_lora;

     // Helpers for getting default parameters
     // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)

@@ -399,30 +396,53 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(void);

+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
+
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

     // Optional: an auto threadpool gets created in ggml if not passed explicitly
     LLAMA_API void llama_attach_threadpool(
-
-
-
+            struct llama_context * ctx,
+            ggml_threadpool_t threadpool,
+            ggml_threadpool_t threadpool_batch);
+
     LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);

-
-
+    DEPRECATED(LLAMA_API struct llama_model * llama_load_model_from_file(
+            const char * path_model,
+            struct llama_model_params params),
+            "use llama_model_load_from_file instead");

-
+    // Load the model from a file
+    // If the file is split into multiple parts, the file name must follow this pattern: <name>-%05d-of-%05d.gguf
+    // If the split file name does not follow this pattern, use llama_model_load_from_splits
+    LLAMA_API struct llama_model * llama_model_load_from_file(
             const char * path_model,
             struct llama_model_params params);

-
+    // Load the model from multiple splits (support custom naming scheme)
+    // The paths must be in the correct order
+    LLAMA_API struct llama_model * llama_model_load_from_splits(
+            const char ** paths,
+            size_t n_paths,
+            struct llama_model_params params);

-
-
+    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
+            "use llama_model_free instead");
+
+    LLAMA_API void llama_model_free(struct llama_model * model);
+
+    LLAMA_API struct llama_context * llama_init_from_model(
             struct llama_model * model,
             struct llama_context_params params);

+    DEPRECATED(LLAMA_API struct llama_context * llama_new_context_with_model(
+            struct llama_model * model,
+            struct llama_context_params params),
+            "use llama_init_from_model instead");
+
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);

@@ -440,20 +460,30 @@ extern "C" {
     LLAMA_API uint32_t llama_n_ubatch  (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);

-    LLAMA_API int32_t
-    LLAMA_API int32_t
-    LLAMA_API int32_t
-    LLAMA_API int32_t
-    LLAMA_API int32_t llama_n_head (const struct llama_model * model);
+    DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_layer    (const struct llama_model * model), "use llama_model_n_layer instead");
+    DEPRECATED(LLAMA_API int32_t llama_n_head     (const struct llama_model * model), "use llama_model_n_head instead");

-    LLAMA_API const struct
+    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

-    LLAMA_API
-    LLAMA_API enum
-
+    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
+    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+
+    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
+    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
+
+    LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_layer    (const struct llama_model * model);
+    LLAMA_API int32_t llama_model_n_head     (const struct llama_model * model);

     // Get the model's RoPE frequency scaling factor
-    LLAMA_API float
+    LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
+
+    LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);

     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure

@@ -479,6 +509,9 @@ extern "C" {
     // Returns the total size of all the tensors in the model in bytes
     LLAMA_API uint64_t llama_model_size(const struct llama_model * model);

+    // Get the default chat template. Returns nullptr if not available
+    LLAMA_API const char * llama_model_chat_template(const struct llama_model * model);
+
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

@@ -501,32 +534,36 @@ extern "C" {
             const char * fname_out,
             const llama_model_quantize_params * params);

+    //
+    // Adapters
+    //
+
     // Load a LoRA adapter from file
-
-    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+    LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
             struct llama_model * model,
             const char * path_lora);

+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be free when the associated model is deleted
+    LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
+
+    // The following functions operate on a llama_context, hence the naming: llama_verb_...
+
     // Add a loaded LoRA adapter to given context
     // This will not modify model's weight
-    LLAMA_API int32_t
+    LLAMA_API int32_t llama_set_adapter_lora(
             struct llama_context * ctx,
-            struct
+            struct llama_adapter_lora * adapter,
             float scale);

     // Remove a specific LoRA adapter from given context
     // Return -1 if the adapter is not present in the context
-    LLAMA_API int32_t
+    LLAMA_API int32_t llama_rm_adapter_lora(
             struct llama_context * ctx,
-            struct
+            struct llama_adapter_lora * adapter);

     // Remove all LoRA adapters from given context
-    LLAMA_API void
-            struct llama_context * ctx);
-
-    // Manually free a LoRA adapter
-    // Note: loaded adapters will be free when the associated model is deleted
-    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
+    LLAMA_API void llama_clear_adapter_lora(struct llama_context * ctx);

     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.

@@ -534,8 +571,8 @@ extern "C" {
     // to an n_embd x n_layers buffer starting from layer 1.
     // il_start and il_end are the layer range the vector should apply to (both inclusive)
     // See llama_control_vector_load in common to load a control vector.
-    LLAMA_API int32_t
-            struct llama_context *
+    LLAMA_API int32_t llama_apply_adapter_cvec(
+            struct llama_context * ctx,
             const float * data,
             size_t len,
             int32_t n_embd,

@@ -546,6 +583,8 @@ extern "C" {
     // KV cache
     //

+    // TODO: remove llama_kv_cache_view_* API
+
     // Information associated with an individual cell in the KV cache view.
     struct llama_kv_cache_view_cell {
         // The position for this cell. Takes KV cache shifts into account.

@@ -592,8 +631,11 @@ extern "C" {
     LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);

     // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    // TODO: change signature to llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_context * ctx)
     LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);

+    ///
+
     // Returns the number of tokens in the KV cache (slow, use only for debug)
     // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);

@@ -663,6 +705,9 @@ extern "C" {
             struct llama_context * ctx,
             llama_seq_id seq_id);

+    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
+    //       how to avoid this?
+
     // Defragment the KV cache
     // This will be applied:
     //   - lazily on next llama_decode()

@@ -883,41 +928,60 @@ extern "C" {
     // Vocab
     //

-    LLAMA_API const char *
+    LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token);

-    LLAMA_API float
+    LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token);

-    LLAMA_API enum llama_token_attr
+    LLAMA_API enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token);

     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
-    LLAMA_API bool
+    LLAMA_API bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token);

     // Identify if Token Id is a control token or a render-able token
-    LLAMA_API bool
+    LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token);

     // Special tokens
-    LLAMA_API llama_token
-    LLAMA_API llama_token
-    LLAMA_API llama_token
-    LLAMA_API llama_token
-    LLAMA_API llama_token
-    LLAMA_API llama_token
-
-
-    LLAMA_API bool
-
-
-
-
-
-
-
-
-    LLAMA_API
-    LLAMA_API
-    LLAMA_API
-    LLAMA_API
-    LLAMA_API
+    LLAMA_API llama_token llama_vocab_bos(const struct llama_vocab * vocab); // beginning-of-sentence
+    LLAMA_API llama_token llama_vocab_eos(const struct llama_vocab * vocab); // end-of-sentence
+    LLAMA_API llama_token llama_vocab_eot(const struct llama_vocab * vocab); // end-of-turn
+    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
+    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
+    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+
+    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+
+    LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab);
+    LLAMA_API llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab);
+
+    DEPRECATED(LLAMA_API const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_text instead");
+    DEPRECATED(LLAMA_API float llama_token_get_score(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_score instead");
+    DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead");
+    DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_nl (const struct llama_vocab * vocab), "use llama_vocab_nl instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_pad(const struct llama_vocab * vocab), "use llama_vocab_pad instead");
+    DEPRECATED(LLAMA_API bool llama_add_bos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_bos instead");
+    DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pre(const struct llama_vocab * vocab), "use llama_vocab_fim_pre instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead");
+    DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead");
+
+    // CLS is equivalent to BOS
+    DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification
+            "use llama_vocab_bos instead");

     //
     // Tokenization

@@ -933,7 +997,7 @@ extern "C" {
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
-        const struct
+        const struct llama_vocab * vocab,
             const char * text,
             int32_t text_len,
             llama_token * tokens,

@@ -947,7 +1011,7 @@ extern "C" {
     // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
-        const struct
+        const struct llama_vocab * vocab,
             llama_token token,
             char * buf,
             int32_t length,

@@ -961,7 +1025,7 @@ extern "C" {
     /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
     /// @param unparse_special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_detokenize(
-        const struct
+        const struct llama_vocab * vocab,
             const llama_token * tokens,
             int32_t n_tokens,
             char * text,

@@ -984,7 +1048,6 @@ extern "C" {
     /// @param length The size of the allocated buffer
     /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
     LLAMA_API int32_t llama_chat_apply_template(
-            const struct llama_model * model,
             const char * tmpl,
             const struct llama_chat_message * chat,
             size_t n_msg,

@@ -1032,7 +1095,6 @@ extern "C" {
     //    llama_sampler_free(smpl);
     //
     // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
-    // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
     //

     typedef void * llama_sampler_context_t;

@@ -1132,7 +1194,7 @@ extern "C" {
             float eta);

     LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
-            const struct
+            const struct llama_vocab * vocab,
             const char * grammar_str,
             const char * grammar_root);

@@ -1144,8 +1206,9 @@ extern "C" {
             float penalty_present); // 0.0 = disabled

     /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
-    LLAMA_API struct llama_sampler *
-            const struct
+    LLAMA_API struct llama_sampler * llama_sampler_init_dry(
+            const struct llama_vocab * vocab,
+            int32_t n_ctx_train,
             float dry_multiplier,
             float dry_base,
             int32_t dry_allowed_length,

@@ -1179,7 +1242,7 @@ extern "C" {
     // 3. discard non-EOG tokens with low prob
     // 4. if no tokens are left -> pick EOT
     //
-    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct
+    LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_vocab * vocab);

     // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
     LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
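The renames above keep the old entry points behind DEPRECATED() wrappers, so existing callers still build, but new code is expected to go through llama_model_load_from_file / llama_init_from_model and the llama_vocab-based token APIs. A rough migration sketch, with error handling left aside; the helper name and the buffer-sizing heuristic are ours, not part of the API:

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Old style (still compiles, now marked deprecated in llama.h):
//   llama_model *   m   = llama_load_model_from_file(path, mparams);
//   llama_context * ctx = llama_new_context_with_model(m, cparams);
//   llama_tokenize(m, ...);  // tokenization went through the llama_model
//
// New style: load with llama_model_load_from_file(), create the context with
// llama_init_from_model(), and tokenize through the model's llama_vocab.
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & text) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    std::vector<llama_token> tokens(text.size() + 8); // generous first guess for the sketch
    int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                               tokens.data(), (int32_t) tokens.size(),
                               /*add_special=*/ true, /*parse_special=*/ false);
    if (n < 0) {
        // buffer was too small; llama_tokenize reports the required size as a negative count
        tokens.resize((size_t) -n);
        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), true, false);
    }
    if (n < 0) {
        n = 0; // still failed; return an empty vector in this sketch
    }
    tokens.resize((size_t) n);
    return tokens;
}
```

The same vocab handle feeds the special-token queries (llama_vocab_bos, llama_vocab_eos, the fill-in-the-middle tokens) and the samplers such as llama_sampler_init_grammar and llama_sampler_init_dry, which now take a const llama_vocab * instead of the model.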
package/src/llama.cpp/src/CMakeLists.txt

@@ -9,9 +9,21 @@ llama_add_compile_flags()
 add_library(llama
             ../include/llama.h
             llama.cpp
-            llama-
+            llama-adapter.cpp
+            llama-arch.cpp
+            llama-batch.cpp
+            llama-chat.cpp
+            llama-context.cpp
             llama-grammar.cpp
+            llama-hparams.cpp
+            llama-impl.cpp
+            llama-kv-cache.cpp
+            llama-mmap.cpp
+            llama-model-loader.cpp
+            llama-model.cpp
+            llama-quant.cpp
             llama-sampling.cpp
+            llama-vocab.cpp
             unicode.h
             unicode.cpp
             unicode-data.cpp
|