@fugood/llama.node 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +8 -9
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +43 -9
- package/src/llama.cpp/.github/workflows/docker.yml +3 -0
- package/src/llama.cpp/CMakeLists.txt +7 -4
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +0 -2
- package/src/llama.cpp/common/arg.cpp +642 -607
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +79 -281
- package/src/llama.cpp/common/common.h +130 -100
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +116 -108
- package/src/llama.cpp/common/sampling.h +20 -20
- package/src/llama.cpp/docs/build.md +37 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +14 -14
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
- package/src/llama.cpp/examples/infill/infill.cpp +40 -86
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/clip.cpp +1 -0
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +37 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
- package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
- package/src/llama.cpp/examples/main/main.cpp +64 -109
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
- package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
- package/src/llama.cpp/examples/server/server.cpp +553 -691
- package/src/llama.cpp/examples/server/utils.hpp +312 -25
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +128 -96
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
- package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +53 -393
- package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
- package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
- package/src/llama.cpp/include/llama.h +67 -33
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-sampling.cpp +745 -105
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +49 -9
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +2636 -2406
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
- package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +1 -0
- package/src/llama.cpp/tests/test-sampling.cpp +162 -137
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
- /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#define LLAMA_H
|
|
3
3
|
|
|
4
4
|
#include "ggml.h"
|
|
5
|
+
#include "ggml-cpu.h"
|
|
5
6
|
#include "ggml-backend.h"
|
|
6
7
|
|
|
7
8
|
#include <stddef.h>
|
|
@@ -205,7 +206,7 @@ extern "C" {
|
|
|
205
206
|
enum llama_split_mode {
|
|
206
207
|
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
|
207
208
|
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
|
208
|
-
LLAMA_SPLIT_MODE_ROW = 2, // split
|
|
209
|
+
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
|
|
209
210
|
};
|
|
210
211
|
|
|
211
212
|
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
|
|
@@ -217,6 +218,7 @@ extern "C" {
|
|
|
217
218
|
|
|
218
219
|
typedef struct llama_token_data_array {
|
|
219
220
|
// TODO: consider SoA
|
|
221
|
+
// NOTE: this pointer can be modified by the samplers
|
|
220
222
|
llama_token_data * data;
|
|
221
223
|
size_t size;
|
|
222
224
|
int64_t selected; // this is the index in the data array (i.e. not the token id)
|
|
@@ -232,8 +234,11 @@ extern "C" {
|
|
|
232
234
|
// - token : the token ids of the input (used when embd is NULL)
|
|
233
235
|
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
|
234
236
|
// - pos : the positions of the respective token in the sequence
|
|
237
|
+
// (if set to NULL, the token position will be tracked automatically by llama_decode)
|
|
235
238
|
// - seq_id : the sequence to which the respective token belongs
|
|
239
|
+
// (if set to NULL, the sequence ID will be assumed to be 0)
|
|
236
240
|
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
|
241
|
+
// (if set to NULL, only the logits for last token will be returned)
|
|
237
242
|
//
|
|
238
243
|
typedef struct llama_batch {
|
|
239
244
|
int32_t n_tokens;
|
|
@@ -244,15 +249,6 @@ extern "C" {
|
|
|
244
249
|
int32_t * n_seq_id;
|
|
245
250
|
llama_seq_id ** seq_id;
|
|
246
251
|
int8_t * logits; // TODO: rename this to "output"
|
|
247
|
-
|
|
248
|
-
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
|
249
|
-
// for future-proof code, use the above fields instead and ignore everything below
|
|
250
|
-
//
|
|
251
|
-
// pos[i] = all_pos_0 + i*all_pos_1
|
|
252
|
-
//
|
|
253
|
-
llama_pos all_pos_0; // used if pos == NULL
|
|
254
|
-
llama_pos all_pos_1; // used if pos == NULL
|
|
255
|
-
llama_seq_id all_seq_id; // used if seq_id == NULL
|
|
256
252
|
} llama_batch;
|
|
257
253
|
|
|
258
254
|
enum llama_model_kv_override_type {
|
|
@@ -279,10 +275,7 @@ extern "C" {
|
|
|
279
275
|
int32_t n_gpu_layers; // number of layers to store in VRAM
|
|
280
276
|
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
|
281
277
|
|
|
282
|
-
//
|
|
283
|
-
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
|
|
284
|
-
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
|
|
285
|
-
// LLAMA_SPLIT_MODE_LAYER: ignored
|
|
278
|
+
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
|
|
286
279
|
int32_t main_gpu;
|
|
287
280
|
|
|
288
281
|
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
|
@@ -433,6 +426,7 @@ extern "C" {
|
|
|
433
426
|
LLAMA_API bool llama_supports_mmap (void);
|
|
434
427
|
LLAMA_API bool llama_supports_mlock (void);
|
|
435
428
|
LLAMA_API bool llama_supports_gpu_offload(void);
|
|
429
|
+
LLAMA_API bool llama_supports_rpc (void);
|
|
436
430
|
|
|
437
431
|
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
|
438
432
|
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
|
@@ -775,15 +769,15 @@ extern "C" {
|
|
|
775
769
|
// Decoding
|
|
776
770
|
//
|
|
777
771
|
|
|
778
|
-
// Return batch for single sequence of tokens
|
|
772
|
+
// Return batch for single sequence of tokens
|
|
773
|
+
// The sequence ID will be fixed to 0
|
|
774
|
+
// The position of the tokens will be tracked automatically by llama_decode
|
|
779
775
|
//
|
|
780
776
|
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
|
781
777
|
//
|
|
782
778
|
LLAMA_API struct llama_batch llama_batch_get_one(
|
|
783
779
|
llama_token * tokens,
|
|
784
|
-
int32_t n_tokens
|
|
785
|
-
llama_pos pos_0,
|
|
786
|
-
llama_seq_id seq_id);
|
|
780
|
+
int32_t n_tokens);
|
|
787
781
|
|
|
788
782
|
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
|
|
789
783
|
// Each token can be assigned up to n_seq_max sequence ids
|
|
@@ -803,7 +797,7 @@ extern "C" {
|
|
|
803
797
|
// Processes a batch of tokens with the encoder part of the encoder-decoder model.
|
|
804
798
|
// Stores the encoder output internally for later use by the decoder cross-attention layers.
|
|
805
799
|
// 0 - success
|
|
806
|
-
// < 0 - error
|
|
800
|
+
// < 0 - error. the KV cache state is restored to the state before this call
|
|
807
801
|
LLAMA_API int32_t llama_encode(
|
|
808
802
|
struct llama_context * ctx,
|
|
809
803
|
struct llama_batch batch);
|
|
@@ -811,7 +805,7 @@ extern "C" {
|
|
|
811
805
|
// Positive return values do not mean a fatal error, but rather a warning.
|
|
812
806
|
// 0 - success
|
|
813
807
|
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
|
814
|
-
// < 0 - error
|
|
808
|
+
// < 0 - error. the KV cache state is restored to the state before this call
|
|
815
809
|
LLAMA_API int32_t llama_decode(
|
|
816
810
|
struct llama_context * ctx,
|
|
817
811
|
struct llama_batch batch);
|
|
@@ -896,6 +890,7 @@ extern "C" {
|
|
|
896
890
|
// Special tokens
|
|
897
891
|
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
|
898
892
|
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
|
893
|
+
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
|
|
899
894
|
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
|
900
895
|
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
|
901
896
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
|
@@ -904,11 +899,17 @@ extern "C" {
|
|
|
904
899
|
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
|
|
905
900
|
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
|
|
906
901
|
|
|
907
|
-
//
|
|
908
|
-
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model)
|
|
909
|
-
LLAMA_API llama_token llama_token_middle(const struct llama_model * model)
|
|
910
|
-
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model)
|
|
911
|
-
|
|
902
|
+
// infill tokens
|
|
903
|
+
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
|
|
904
|
+
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
|
|
905
|
+
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
|
|
906
|
+
|
|
907
|
+
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
|
|
908
|
+
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
|
|
909
|
+
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
|
|
910
|
+
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
|
|
911
|
+
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
|
|
912
|
+
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
|
|
912
913
|
|
|
913
914
|
//
|
|
914
915
|
// Tokenization
|
|
@@ -1067,12 +1068,13 @@ extern "C" {
|
|
|
1067
1068
|
|
|
1068
1069
|
// available samplers:
|
|
1069
1070
|
|
|
1070
|
-
LLAMA_API struct llama_sampler * llama_sampler_init_greedy
|
|
1071
|
-
LLAMA_API struct llama_sampler * llama_sampler_init_dist
|
|
1071
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
|
|
1072
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
|
|
1072
1073
|
|
|
1073
1074
|
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
|
|
1074
1075
|
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
|
|
1075
|
-
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void)
|
|
1076
|
+
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
|
|
1077
|
+
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
|
|
1076
1078
|
|
|
1077
1079
|
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
|
|
1078
1080
|
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
|
|
@@ -1083,16 +1085,18 @@ extern "C" {
|
|
|
1083
1085
|
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
|
|
1084
1086
|
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
|
|
1085
1087
|
|
|
1086
|
-
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
|
|
1087
|
-
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
|
|
1088
|
-
|
|
1089
1088
|
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
|
|
1090
1089
|
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
|
|
1090
|
+
|
|
1091
|
+
/// @details Updates the logits l_i = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
|
|
1091
1092
|
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
|
|
1092
1093
|
|
|
1093
1094
|
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
|
|
1094
1095
|
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
|
|
1095
1096
|
|
|
1097
|
+
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
|
|
1098
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
|
|
1099
|
+
|
|
1096
1100
|
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
|
|
1097
1101
|
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
|
|
1098
1102
|
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
|
|
@@ -1132,11 +1136,43 @@ extern "C" {
|
|
|
1132
1136
|
bool penalize_nl, // consider newlines as a repeatable token
|
|
1133
1137
|
bool ignore_eos); // ignore the end-of-sequence token
|
|
1134
1138
|
|
|
1139
|
+
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
|
|
1140
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
|
|
1141
|
+
const struct llama_model * model,
|
|
1142
|
+
float dry_multiplier,
|
|
1143
|
+
float dry_base,
|
|
1144
|
+
int32_t dry_allowed_length,
|
|
1145
|
+
int32_t dry_penalty_last_n,
|
|
1146
|
+
const char ** seq_breakers,
|
|
1147
|
+
size_t num_breakers);
|
|
1148
|
+
|
|
1135
1149
|
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
|
|
1136
1150
|
int32_t n_vocab,
|
|
1137
1151
|
int32_t n_logit_bias,
|
|
1138
1152
|
const llama_logit_bias * logit_bias);
|
|
1139
1153
|
|
|
1154
|
+
// this sampler is meant to be used for fill-in-the-middle infilling
|
|
1155
|
+
// it's supposed to be used after top_k + top_p sampling
|
|
1156
|
+
//
|
|
1157
|
+
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
|
|
1158
|
+
// 2. combine probs of tokens that have the same prefix
|
|
1159
|
+
//
|
|
1160
|
+
// example:
|
|
1161
|
+
//
|
|
1162
|
+
// - before:
|
|
1163
|
+
// "hel": 0.5
|
|
1164
|
+
// "hell": 0.2
|
|
1165
|
+
// "hello": 0.1
|
|
1166
|
+
// "dummy": 0.1
|
|
1167
|
+
//
|
|
1168
|
+
// - after:
|
|
1169
|
+
// "hel": 0.8
|
|
1170
|
+
// "dummy": 0.1
|
|
1171
|
+
//
|
|
1172
|
+
// 3. discard non-EOG tokens with low prob
|
|
1173
|
+
// 4. if no tokens are left -> pick EOT
|
|
1174
|
+
//
|
|
1175
|
+
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
|
|
1140
1176
|
|
|
1141
1177
|
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
|
|
1142
1178
|
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
|
|
@@ -1208,8 +1244,6 @@ extern "C" {
|
|
|
1208
1244
|
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
|
1209
1245
|
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
|
1210
1246
|
|
|
1211
|
-
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
|
1212
|
-
|
|
1213
1247
|
#ifdef __cplusplus
|
|
1214
1248
|
}
|
|
1215
1249
|
#endif
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
#include <type_traits>
|
|
12
12
|
|
|
13
13
|
#include <ggml.h>
|
|
14
|
+
#include <ggml-cpu.h>
|
|
14
15
|
|
|
15
16
|
constexpr int kVecSize = 1 << 16;
|
|
16
17
|
|
|
@@ -136,7 +137,7 @@ int main(int argc, char** argv) {
|
|
|
136
137
|
|
|
137
138
|
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
|
138
139
|
|
|
139
|
-
auto funcs =
|
|
140
|
+
const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
|
|
140
141
|
|
|
141
142
|
Stat simple, ggml;
|
|
142
143
|
|
|
@@ -156,8 +157,8 @@ int main(int argc, char** argv) {
|
|
|
156
157
|
|
|
157
158
|
t1 = std::chrono::high_resolution_clock::now();
|
|
158
159
|
float fs;
|
|
159
|
-
if (type == 0) funcs
|
|
160
|
-
else funcs
|
|
160
|
+
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
|
|
161
|
+
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
|
|
161
162
|
t2 = std::chrono::high_resolution_clock::now();
|
|
162
163
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
|
163
164
|
if (iloop > 3) ggml.addResult(fs, t);
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
#include <array>
|
|
10
10
|
|
|
11
11
|
#include <ggml.h>
|
|
12
|
+
#include <ggml-cpu.h>
|
|
12
13
|
|
|
13
14
|
#if defined(_MSC_VER)
|
|
14
15
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
|
@@ -236,7 +237,7 @@ int main(int argc, char** argv) {
|
|
|
236
237
|
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
|
237
238
|
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
|
238
239
|
|
|
239
|
-
auto
|
|
240
|
+
const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
|
|
240
241
|
|
|
241
242
|
std::vector<block_q4_0> q40;
|
|
242
243
|
std::vector<block_q4_1> q41;
|
|
@@ -261,9 +262,9 @@ int main(int argc, char** argv) {
|
|
|
261
262
|
// Note, we do not include this in the timing as in practical application
|
|
262
263
|
// we already have the quantized model weights.
|
|
263
264
|
if (useQ4_1) {
|
|
264
|
-
|
|
265
|
+
funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
|
|
265
266
|
} else {
|
|
266
|
-
|
|
267
|
+
funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
|
|
267
268
|
}
|
|
268
269
|
|
|
269
270
|
// Now measure time the dot product needs using the "scalar" version above
|
|
@@ -282,10 +283,10 @@ int main(int argc, char** argv) {
|
|
|
282
283
|
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
|
283
284
|
}
|
|
284
285
|
else {
|
|
285
|
-
auto vdot =
|
|
286
|
-
vdot
|
|
287
|
-
if (useQ4_1)
|
|
288
|
-
else
|
|
286
|
+
const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
|
|
287
|
+
vdot->from_float(y1.data(), q8.data(), kVecSize);
|
|
288
|
+
if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
|
|
289
|
+
else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
|
|
289
290
|
}
|
|
290
291
|
sumq += result;
|
|
291
292
|
t2 = std::chrono::high_resolution_clock::now();
|
|
@@ -29,5 +29,6 @@ target_link_libraries(llama PUBLIC ggml)
|
|
|
29
29
|
|
|
30
30
|
if (BUILD_SHARED_LIBS)
|
|
31
31
|
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
32
|
-
target_compile_definitions(llama PRIVATE
|
|
32
|
+
target_compile_definitions(llama PRIVATE LLAMA_BUILD)
|
|
33
|
+
target_compile_definitions(llama PUBLIC LLAMA_SHARED)
|
|
33
34
|
endif()
|