llama_cpp 0.16.2 → 0.17.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
--- a/data/vendor/tmp/llama.cpp/llama.h
+++ /dev/null
@@ -1,1147 +0,0 @@
|
|
1
|
-
#ifndef LLAMA_H
|
2
|
-
#define LLAMA_H
|
3
|
-
|
4
|
-
#include "ggml.h"
|
5
|
-
#include "ggml-backend.h"
|
6
|
-
|
7
|
-
#include <stddef.h>
|
8
|
-
#include <stdint.h>
|
9
|
-
#include <stdio.h>
|
10
|
-
#include <stdbool.h>
|
11
|
-
|
12
|
-
#ifdef LLAMA_SHARED
|
13
|
-
# if defined(_WIN32) && !defined(__MINGW32__)
|
14
|
-
# ifdef LLAMA_BUILD
|
15
|
-
# define LLAMA_API __declspec(dllexport)
|
16
|
-
# else
|
17
|
-
# define LLAMA_API __declspec(dllimport)
|
18
|
-
# endif
|
19
|
-
# else
|
20
|
-
# define LLAMA_API __attribute__ ((visibility ("default")))
|
21
|
-
# endif
|
22
|
-
#else
|
23
|
-
# define LLAMA_API
|
24
|
-
#endif
|
25
|
-
|
26
|
-
#ifdef __GNUC__
|
27
|
-
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
28
|
-
#elif defined(_MSC_VER)
|
29
|
-
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
30
|
-
#else
|
31
|
-
# define DEPRECATED(func, hint) func
|
32
|
-
#endif
|
33
|
-
|
34
|
-
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
|
35
|
-
|
36
|
-
#define LLAMA_MAX_RNG_STATE (64*1024)
|
37
|
-
|
38
|
-
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
39
|
-
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
40
|
-
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
|
41
|
-
|
42
|
-
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
43
|
-
#define LLAMA_SESSION_VERSION 6
|
44
|
-
|
45
|
-
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
|
46
|
-
#define LLAMA_STATE_SEQ_VERSION 1
|
47
|
-
|
48
|
-
#ifdef __cplusplus
|
49
|
-
extern "C" {
|
50
|
-
#endif
|
51
|
-
|
52
|
-
//
|
53
|
-
// C interface
|
54
|
-
//
|
55
|
-
// TODO: show sample usage
|
56
|
-
//
|
57
|
-
|
58
|
-
struct llama_model;
|
59
|
-
struct llama_context;
|
60
|
-
|
61
|
-
typedef int32_t llama_pos;
|
62
|
-
typedef int32_t llama_token;
|
63
|
-
typedef int32_t llama_seq_id;
|
64
|
-
|
65
|
-
enum llama_vocab_type {
|
66
|
-
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
67
|
-
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
|
68
|
-
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
|
69
|
-
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
|
70
|
-
};
|
71
|
-
|
72
|
-
// pre-tokenization types
|
73
|
-
enum llama_vocab_pre_type {
|
74
|
-
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
75
|
-
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
76
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
77
|
-
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
78
|
-
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
79
|
-
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
80
|
-
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
81
|
-
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
82
|
-
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
83
|
-
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
84
|
-
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
85
|
-
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
86
|
-
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
87
|
-
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
88
|
-
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
89
|
-
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
90
|
-
};
|
91
|
-
|
92
|
-
// note: these values should be synchronized with ggml_rope
|
93
|
-
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
|
94
|
-
enum llama_rope_type {
|
95
|
-
LLAMA_ROPE_TYPE_NONE = -1,
|
96
|
-
LLAMA_ROPE_TYPE_NORM = 0,
|
97
|
-
LLAMA_ROPE_TYPE_NEOX = 2,
|
98
|
-
LLAMA_ROPE_TYPE_GLM = 4,
|
99
|
-
};
|
100
|
-
|
101
|
-
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
|
102
|
-
LLAMA_TOKEN_TYPE_UNDEFINED = 0,
|
103
|
-
LLAMA_TOKEN_TYPE_NORMAL = 1,
|
104
|
-
LLAMA_TOKEN_TYPE_UNKNOWN = 2,
|
105
|
-
LLAMA_TOKEN_TYPE_CONTROL = 3,
|
106
|
-
LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
|
107
|
-
LLAMA_TOKEN_TYPE_UNUSED = 5,
|
108
|
-
LLAMA_TOKEN_TYPE_BYTE = 6,
|
109
|
-
};
|
110
|
-
|
111
|
-
enum llama_token_attr {
|
112
|
-
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
|
113
|
-
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
|
114
|
-
LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
|
115
|
-
LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
|
116
|
-
LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
|
117
|
-
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
|
118
|
-
LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
|
119
|
-
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
|
120
|
-
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
|
121
|
-
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
|
122
|
-
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
|
123
|
-
};
|
124
|
-
|
125
|
-
// model file types
|
126
|
-
enum llama_ftype {
|
127
|
-
LLAMA_FTYPE_ALL_F32 = 0,
|
128
|
-
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
129
|
-
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
130
|
-
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
131
|
-
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
132
|
-
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
|
133
|
-
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
|
134
|
-
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
135
|
-
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
136
|
-
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
137
|
-
LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
|
138
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
|
139
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
|
140
|
-
LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
|
141
|
-
LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
|
142
|
-
LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
|
143
|
-
LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
|
144
|
-
LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
|
145
|
-
LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
|
146
|
-
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
147
|
-
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
148
|
-
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
149
|
-
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
|
150
|
-
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
|
151
|
-
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
|
152
|
-
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
|
153
|
-
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
|
154
|
-
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
|
155
|
-
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
|
156
|
-
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
157
|
-
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
158
|
-
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
159
|
-
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
160
|
-
|
161
|
-
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
162
|
-
};
|
163
|
-
|
164
|
-
enum llama_rope_scaling_type {
|
165
|
-
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
166
|
-
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
167
|
-
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
|
168
|
-
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
|
169
|
-
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
|
170
|
-
};
|
171
|
-
|
172
|
-
enum llama_pooling_type {
|
173
|
-
LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
|
174
|
-
LLAMA_POOLING_TYPE_NONE = 0,
|
175
|
-
LLAMA_POOLING_TYPE_MEAN = 1,
|
176
|
-
LLAMA_POOLING_TYPE_CLS = 2,
|
177
|
-
LLAMA_POOLING_TYPE_LAST = 3,
|
178
|
-
};
|
179
|
-
|
180
|
-
enum llama_split_mode {
|
181
|
-
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
182
|
-
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
183
|
-
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
184
|
-
};
|
185
|
-
|
186
|
-
typedef struct llama_token_data {
|
187
|
-
llama_token id; // token id
|
188
|
-
float logit; // log-odds of the token
|
189
|
-
float p; // probability of the token
|
190
|
-
} llama_token_data;
|
191
|
-
|
192
|
-
typedef struct llama_token_data_array {
|
193
|
-
llama_token_data * data;
|
194
|
-
size_t size;
|
195
|
-
bool sorted;
|
196
|
-
} llama_token_data_array;
|
197
|
-
|
198
|
-
typedef bool (*llama_progress_callback)(float progress, void * user_data);
|
199
|
-
|
200
|
-
// Input data for llama_decode
|
201
|
-
// A llama_batch object can contain input about one or many sequences
|
202
|
-
// The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
|
203
|
-
//
|
204
|
-
// - token : the token ids of the input (used when embd is NULL)
|
205
|
-
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
|
206
|
-
// - pos : the positions of the respective token in the sequence
|
207
|
-
// - seq_id : the sequence to which the respective token belongs
|
208
|
-
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
|
209
|
-
//
|
210
|
-
typedef struct llama_batch {
|
211
|
-
int32_t n_tokens;
|
212
|
-
|
213
|
-
llama_token * token;
|
214
|
-
float * embd;
|
215
|
-
llama_pos * pos;
|
216
|
-
int32_t * n_seq_id;
|
217
|
-
llama_seq_id ** seq_id;
|
218
|
-
int8_t * logits; // TODO: rename this to "output"
|
219
|
-
|
220
|
-
// NOTE: helpers for smooth API transition - can be deprecated in the future
|
221
|
-
// for future-proof code, use the above fields instead and ignore everything below
|
222
|
-
//
|
223
|
-
// pos[i] = all_pos_0 + i*all_pos_1
|
224
|
-
//
|
225
|
-
llama_pos all_pos_0; // used if pos == NULL
|
226
|
-
llama_pos all_pos_1; // used if pos == NULL
|
227
|
-
llama_seq_id all_seq_id; // used if seq_id == NULL
|
228
|
-
} llama_batch;
|
229
|
-
|
230
|
-
enum llama_model_kv_override_type {
|
231
|
-
LLAMA_KV_OVERRIDE_TYPE_INT,
|
232
|
-
LLAMA_KV_OVERRIDE_TYPE_FLOAT,
|
233
|
-
LLAMA_KV_OVERRIDE_TYPE_BOOL,
|
234
|
-
LLAMA_KV_OVERRIDE_TYPE_STR,
|
235
|
-
};
|
236
|
-
|
237
|
-
struct llama_model_kv_override {
|
238
|
-
enum llama_model_kv_override_type tag;
|
239
|
-
|
240
|
-
char key[128];
|
241
|
-
|
242
|
-
union {
|
243
|
-
int64_t val_i64;
|
244
|
-
double val_f64;
|
245
|
-
bool val_bool;
|
246
|
-
char val_str[128];
|
247
|
-
};
|
248
|
-
};
|
249
|
-
|
250
|
-
struct llama_model_params {
|
251
|
-
int32_t n_gpu_layers; // number of layers to store in VRAM
|
252
|
-
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
253
|
-
|
254
|
-
// main_gpu interpretation depends on split_mode:
|
255
|
-
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
256
|
-
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
257
|
-
// LLAMA_SPLIT_LAYER: ignored
|
258
|
-
int32_t main_gpu;
|
259
|
-
|
260
|
-
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
261
|
-
const float * tensor_split;
|
262
|
-
|
263
|
-
// comma separated list of RPC servers to use for offloading
|
264
|
-
const char * rpc_servers;
|
265
|
-
|
266
|
-
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
267
|
-
// If the provided progress_callback returns true, model loading continues.
|
268
|
-
// If it returns false, model loading is immediately aborted.
|
269
|
-
llama_progress_callback progress_callback;
|
270
|
-
|
271
|
-
// context pointer passed to the progress callback
|
272
|
-
void * progress_callback_user_data;
|
273
|
-
|
274
|
-
// override key-value pairs of the model meta data
|
275
|
-
const struct llama_model_kv_override * kv_overrides;
|
276
|
-
|
277
|
-
// Keep the booleans together to avoid misalignment during copy-by-value.
|
278
|
-
bool vocab_only; // only load the vocabulary, no weights
|
279
|
-
bool use_mmap; // use mmap if possible
|
280
|
-
bool use_mlock; // force system to keep model in RAM
|
281
|
-
bool check_tensors; // validate model tensor data
|
282
|
-
};
|
283
|
-
|
284
|
-
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
|
285
|
-
// https://github.com/ggerganov/llama.cpp/pull/7544
|
286
|
-
struct llama_context_params {
|
287
|
-
uint32_t seed; // RNG seed, -1 for random
|
288
|
-
uint32_t n_ctx; // text context, 0 = from model
|
289
|
-
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
|
290
|
-
uint32_t n_ubatch; // physical maximum batch size
|
291
|
-
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
292
|
-
uint32_t n_threads; // number of threads to use for generation
|
293
|
-
uint32_t n_threads_batch; // number of threads to use for batch processing
|
294
|
-
|
295
|
-
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
296
|
-
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
297
|
-
|
298
|
-
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
299
|
-
float rope_freq_base; // RoPE base frequency, 0 = from model
|
300
|
-
float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
|
301
|
-
float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
|
302
|
-
float yarn_attn_factor; // YaRN magnitude scaling factor
|
303
|
-
float yarn_beta_fast; // YaRN low correction dim
|
304
|
-
float yarn_beta_slow; // YaRN high correction dim
|
305
|
-
uint32_t yarn_orig_ctx; // YaRN original context size
|
306
|
-
float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
|
307
|
-
|
308
|
-
ggml_backend_sched_eval_callback cb_eval;
|
309
|
-
void * cb_eval_user_data;
|
310
|
-
|
311
|
-
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
|
312
|
-
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
|
313
|
-
|
314
|
-
// Keep the booleans together to avoid misalignment during copy-by-value.
|
315
|
-
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
316
|
-
bool embeddings; // if true, extract embeddings (together with logits)
|
317
|
-
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
318
|
-
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
|
319
|
-
|
320
|
-
// Abort callback
|
321
|
-
// if it returns true, execution of llama_decode() will be aborted
|
322
|
-
// currently works only with CPU execution
|
323
|
-
ggml_abort_callback abort_callback;
|
324
|
-
void * abort_callback_data;
|
325
|
-
};
|
326
|
-
|
327
|
-
// model quantization parameters
|
328
|
-
typedef struct llama_model_quantize_params {
|
329
|
-
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
330
|
-
enum llama_ftype ftype; // quantize to this llama_ftype
|
331
|
-
enum ggml_type output_tensor_type; // output tensor type
|
332
|
-
enum ggml_type token_embedding_type; // itoken embeddings tensor type
|
333
|
-
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
334
|
-
bool quantize_output_tensor; // quantize output.weight
|
335
|
-
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
336
|
-
bool pure; // quantize all tensors to the default type
|
337
|
-
bool keep_split; // quantize to the same number of shards
|
338
|
-
void * imatrix; // pointer to importance matrix data
|
339
|
-
void * kv_overrides; // pointer to vector containing overrides
|
340
|
-
} llama_model_quantize_params;
|
341
|
-
|
342
|
-
// grammar types
|
343
|
-
struct llama_grammar;
|
344
|
-
|
345
|
-
// grammar element type
|
346
|
-
enum llama_gretype {
|
347
|
-
// end of rule definition
|
348
|
-
LLAMA_GRETYPE_END = 0,
|
349
|
-
|
350
|
-
// start of alternate definition for rule
|
351
|
-
LLAMA_GRETYPE_ALT = 1,
|
352
|
-
|
353
|
-
// non-terminal element: reference to rule
|
354
|
-
LLAMA_GRETYPE_RULE_REF = 2,
|
355
|
-
|
356
|
-
// terminal element: character (code point)
|
357
|
-
LLAMA_GRETYPE_CHAR = 3,
|
358
|
-
|
359
|
-
// inverse char(s) ([^a], [^a-b] [^abc])
|
360
|
-
LLAMA_GRETYPE_CHAR_NOT = 4,
|
361
|
-
|
362
|
-
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
|
363
|
-
// be an inclusive range ([a-z])
|
364
|
-
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
|
365
|
-
|
366
|
-
// modifies a preceding LLAMA_GRETYPE_CHAR or
|
367
|
-
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
|
368
|
-
LLAMA_GRETYPE_CHAR_ALT = 6,
|
369
|
-
|
370
|
-
// any character (.)
|
371
|
-
LLAMA_GRETYPE_CHAR_ANY = 7,
|
372
|
-
};
|
373
|
-
|
374
|
-
typedef struct llama_grammar_element {
|
375
|
-
enum llama_gretype type;
|
376
|
-
uint32_t value; // Unicode code point or rule ID
|
377
|
-
} llama_grammar_element;
|
378
|
-
|
379
|
-
// performance timing information
|
380
|
-
struct llama_timings {
|
381
|
-
double t_start_ms;
|
382
|
-
double t_end_ms;
|
383
|
-
double t_load_ms;
|
384
|
-
double t_sample_ms;
|
385
|
-
double t_p_eval_ms;
|
386
|
-
double t_eval_ms;
|
387
|
-
|
388
|
-
int32_t n_sample;
|
389
|
-
int32_t n_p_eval;
|
390
|
-
int32_t n_eval;
|
391
|
-
};
|
392
|
-
|
393
|
-
// used in chat template
|
394
|
-
typedef struct llama_chat_message {
|
395
|
-
const char * role;
|
396
|
-
const char * content;
|
397
|
-
} llama_chat_message;
|
398
|
-
|
399
|
-
// Helpers for getting default parameters
|
400
|
-
LLAMA_API struct llama_model_params llama_model_default_params(void);
|
401
|
-
LLAMA_API struct llama_context_params llama_context_default_params(void);
|
402
|
-
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
|
403
|
-
|
404
|
-
// Initialize the llama + ggml backend
|
405
|
-
// If numa is true, use NUMA optimizations
|
406
|
-
// Call once at the start of the program
|
407
|
-
LLAMA_API void llama_backend_init(void);
|
408
|
-
|
409
|
-
//optional:
|
410
|
-
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
|
411
|
-
|
412
|
-
// Call once at the end of the program - currently only used for MPI
|
413
|
-
LLAMA_API void llama_backend_free(void);
|
414
|
-
|
415
|
-
LLAMA_API struct llama_model * llama_load_model_from_file(
|
416
|
-
const char * path_model,
|
417
|
-
struct llama_model_params params);
|
418
|
-
|
419
|
-
LLAMA_API void llama_free_model(struct llama_model * model);
|
420
|
-
|
421
|
-
LLAMA_API struct llama_context * llama_new_context_with_model(
|
422
|
-
struct llama_model * model,
|
423
|
-
struct llama_context_params params);
|
424
|
-
|
425
|
-
// Frees all allocated memory
|
426
|
-
LLAMA_API void llama_free(struct llama_context * ctx);
|
427
|
-
|
428
|
-
LLAMA_API int64_t llama_time_us(void);
|
429
|
-
|
430
|
-
LLAMA_API size_t llama_max_devices(void);
|
431
|
-
|
432
|
-
LLAMA_API bool llama_supports_mmap (void);
|
433
|
-
LLAMA_API bool llama_supports_mlock (void);
|
434
|
-
LLAMA_API bool llama_supports_gpu_offload(void);
|
435
|
-
|
436
|
-
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
|
437
|
-
|
438
|
-
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
|
439
|
-
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
|
440
|
-
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
|
441
|
-
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
|
442
|
-
|
443
|
-
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
|
444
|
-
|
445
|
-
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
|
446
|
-
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
|
447
|
-
|
448
|
-
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
449
|
-
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
450
|
-
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
451
|
-
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
452
|
-
|
453
|
-
// Get the model's RoPE frequency scaling factor
|
454
|
-
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
455
|
-
|
456
|
-
// Functions to access the model's GGUF metadata scalar values
|
457
|
-
// - The functions return the length of the string on success, or -1 on failure
|
458
|
-
// - The output string is always null-terminated and cleared on failure
|
459
|
-
// - GGUF array values are not supported by these functions
|
460
|
-
|
461
|
-
// Get metadata value as a string by key name
|
462
|
-
LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
|
463
|
-
|
464
|
-
// Get the number of metadata key/value pairs
|
465
|
-
LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
|
466
|
-
|
467
|
-
// Get metadata key name by index
|
468
|
-
LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
|
469
|
-
|
470
|
-
// Get metadata value as a string by index
|
471
|
-
LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
|
472
|
-
|
473
|
-
// Get a string describing the model type
|
474
|
-
LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
475
|
-
|
476
|
-
// Returns the total size of all the tensors in the model in bytes
|
477
|
-
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
478
|
-
|
479
|
-
// Returns the total number of parameters in the model
|
480
|
-
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
481
|
-
|
482
|
-
// Get a llama model tensor
|
483
|
-
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
|
484
|
-
|
485
|
-
// Returns 0 on success
|
486
|
-
LLAMA_API uint32_t llama_model_quantize(
|
487
|
-
const char * fname_inp,
|
488
|
-
const char * fname_out,
|
489
|
-
const llama_model_quantize_params * params);
|
490
|
-
|
491
|
-
// Apply a LoRA adapter to a loaded model
|
492
|
-
// path_base_model is the path to a higher quality model to use as a base for
|
493
|
-
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
494
|
-
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
495
|
-
// will be applied on top of the previous one
|
496
|
-
// Returns 0 on success
|
497
|
-
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
498
|
-
const struct llama_model * model,
|
499
|
-
const char * path_lora,
|
500
|
-
float scale,
|
501
|
-
const char * path_base_model,
|
502
|
-
int32_t n_threads);
|
503
|
-
|
504
|
-
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
505
|
-
// the currently loaded vector.
|
506
|
-
// n_embd should be the size of a single layer's control, and data should point
|
507
|
-
// to an n_embd x n_layers buffer starting from layer 1.
|
508
|
-
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
509
|
-
// See llama_control_vector_load in common to load a control vector.
|
510
|
-
LLAMA_API int32_t llama_control_vector_apply(
|
511
|
-
struct llama_context * lctx,
|
512
|
-
const float * data,
|
513
|
-
size_t len,
|
514
|
-
int32_t n_embd,
|
515
|
-
int32_t il_start,
|
516
|
-
int32_t il_end);
|
517
|
-
|
518
|
-
//
|
519
|
-
// KV cache
|
520
|
-
//
|
521
|
-
|
522
|
-
// Information associated with an individual cell in the KV cache view.
|
523
|
-
struct llama_kv_cache_view_cell {
|
524
|
-
// The position for this cell. Takes KV cache shifts into account.
|
525
|
-
// May be negative if the cell is not populated.
|
526
|
-
llama_pos pos;
|
527
|
-
};
|
528
|
-
|
529
|
-
// An updateable view of the KV cache.
|
530
|
-
struct llama_kv_cache_view {
|
531
|
-
// Number of KV cache cells. This will be the same as the context size.
|
532
|
-
int32_t n_cells;
|
533
|
-
|
534
|
-
// Maximum number of sequences that can exist in a cell. It's not an error
|
535
|
-
// if there are more sequences in a cell than this value, however they will
|
536
|
-
// not be visible in the view cells_sequences.
|
537
|
-
int32_t n_seq_max;
|
538
|
-
|
539
|
-
// Number of tokens in the cache. For example, if there are two populated
|
540
|
-
// cells, the first with 1 sequence id in it and the second with 2 sequence
|
541
|
-
// ids then you'll have 3 tokens.
|
542
|
-
int32_t token_count;
|
543
|
-
|
544
|
-
// Number of populated cache cells.
|
545
|
-
int32_t used_cells;
|
546
|
-
|
547
|
-
// Maximum contiguous empty slots in the cache.
|
548
|
-
int32_t max_contiguous;
|
549
|
-
|
550
|
-
// Index to the start of the max_contiguous slot range. Can be negative
|
551
|
-
// when cache is full.
|
552
|
-
int32_t max_contiguous_idx;
|
553
|
-
|
554
|
-
// Information for an individual cell.
|
555
|
-
struct llama_kv_cache_view_cell * cells;
|
556
|
-
|
557
|
-
// The sequences for each cell. There will be n_seq_max items per cell.
|
558
|
-
llama_seq_id * cells_sequences;
|
559
|
-
};
|
560
|
-
|
561
|
-
// Create an empty KV cache view. (use only for debugging purposes)
|
562
|
-
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
|
563
|
-
|
564
|
-
// Free a KV cache view. (use only for debugging purposes)
|
565
|
-
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
|
566
|
-
|
567
|
-
// Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
|
568
|
-
LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
|
569
|
-
|
570
|
-
// Returns the number of tokens in the KV cache (slow, use only for debug)
|
571
|
-
// If a KV cell has multiple sequences assigned to it, it will be counted multiple times
|
572
|
-
LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
|
573
|
-
|
574
|
-
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
|
575
|
-
LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
|
576
|
-
|
577
|
-
// Clear the KV cache - both cell info is erased and KV data is zeroed
|
578
|
-
LLAMA_API void llama_kv_cache_clear(
|
579
|
-
struct llama_context * ctx);
|
580
|
-
|
581
|
-
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
|
582
|
-
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
|
583
|
-
// seq_id < 0 : match any sequence
|
584
|
-
// p0 < 0 : [0, p1]
|
585
|
-
// p1 < 0 : [p0, inf)
|
586
|
-
LLAMA_API bool llama_kv_cache_seq_rm(
|
587
|
-
struct llama_context * ctx,
|
588
|
-
llama_seq_id seq_id,
|
589
|
-
llama_pos p0,
|
590
|
-
llama_pos p1);
|
591
|
-
|
592
|
-
// Copy all tokens that belong to the specified sequence to another sequence
|
593
|
-
// Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
|
594
|
-
// p0 < 0 : [0, p1]
|
595
|
-
// p1 < 0 : [p0, inf)
|
596
|
-
LLAMA_API void llama_kv_cache_seq_cp(
|
597
|
-
struct llama_context * ctx,
|
598
|
-
llama_seq_id seq_id_src,
|
599
|
-
llama_seq_id seq_id_dst,
|
600
|
-
llama_pos p0,
|
601
|
-
llama_pos p1);
|
602
|
-
|
603
|
-
// Removes all tokens that do not belong to the specified sequence
|
604
|
-
LLAMA_API void llama_kv_cache_seq_keep(
|
605
|
-
struct llama_context * ctx,
|
606
|
-
llama_seq_id seq_id);
|
607
|
-
|
608
|
-
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
|
609
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
610
|
-
// - lazily on next llama_decode()
|
611
|
-
// - explicitly with llama_kv_cache_update()
|
612
|
-
// p0 < 0 : [0, p1]
|
613
|
-
// p1 < 0 : [p0, inf)
|
614
|
-
LLAMA_API void llama_kv_cache_seq_add(
|
615
|
-
struct llama_context * ctx,
|
616
|
-
llama_seq_id seq_id,
|
617
|
-
llama_pos p0,
|
618
|
-
llama_pos p1,
|
619
|
-
llama_pos delta);
|
620
|
-
|
621
|
-
// Integer division of the positions by factor of `d > 1`
|
622
|
-
// If the KV cache is RoPEd, the KV data is updated accordingly:
|
623
|
-
// - lazily on next llama_decode()
|
624
|
-
// - explicitly with llama_kv_cache_update()
|
625
|
-
// p0 < 0 : [0, p1]
|
626
|
-
// p1 < 0 : [p0, inf)
|
627
|
-
LLAMA_API void llama_kv_cache_seq_div(
|
628
|
-
struct llama_context * ctx,
|
629
|
-
llama_seq_id seq_id,
|
630
|
-
llama_pos p0,
|
631
|
-
llama_pos p1,
|
632
|
-
int d);
|
633
|
-
|
634
|
-
// Returns the largest position present in the KV cache for the specified sequence
|
635
|
-
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
|
636
|
-
struct llama_context * ctx,
|
637
|
-
llama_seq_id seq_id);
|
638
|
-
|
639
|
-
// Defragment the KV cache
|
640
|
-
// This will be applied:
|
641
|
-
// - lazily on next llama_decode()
|
642
|
-
// - explicitly with llama_kv_cache_update()
|
643
|
-
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
|
644
|
-
|
645
|
-
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
646
|
-
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
647
|
-
|
648
|
-
//
|
649
|
-
// State / sessions
|
650
|
-
//
|
651
|
-
|
652
|
-
// Returns the maximum size in bytes of the state (rng, logits, embedding
|
653
|
-
// and kv_cache) - will often be smaller after compacting tokens
|
654
|
-
LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
|
655
|
-
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
|
656
|
-
"use llama_state_get_size instead");
|
657
|
-
|
658
|
-
// Copies the state to the specified destination address.
|
659
|
-
// Destination needs to have allocated enough memory.
|
660
|
-
// Returns the number of bytes copied
|
661
|
-
LLAMA_API size_t llama_state_get_data(
|
662
|
-
struct llama_context * ctx,
|
663
|
-
uint8_t * dst);
|
664
|
-
LLAMA_API DEPRECATED(size_t llama_copy_state_data(
|
665
|
-
struct llama_context * ctx,
|
666
|
-
uint8_t * dst),
|
667
|
-
"use llama_state_get_data instead");
|
668
|
-
|
669
|
-
// Set the state reading from the specified address
|
670
|
-
// Returns the number of bytes read
|
671
|
-
LLAMA_API size_t llama_state_set_data(
|
672
|
-
struct llama_context * ctx,
|
673
|
-
const uint8_t * src);
|
674
|
-
LLAMA_API DEPRECATED(size_t llama_set_state_data(
|
675
|
-
struct llama_context * ctx,
|
676
|
-
const uint8_t * src),
|
677
|
-
"use llama_state_set_data instead");
|
678
|
-
|
679
|
-
// Save/load session file
|
680
|
-
LLAMA_API bool llama_state_load_file(
|
681
|
-
struct llama_context * ctx,
|
682
|
-
const char * path_session,
|
683
|
-
llama_token * tokens_out,
|
684
|
-
size_t n_token_capacity,
|
685
|
-
size_t * n_token_count_out);
|
686
|
-
LLAMA_API DEPRECATED(bool llama_load_session_file(
|
687
|
-
struct llama_context * ctx,
|
688
|
-
const char * path_session,
|
689
|
-
llama_token * tokens_out,
|
690
|
-
size_t n_token_capacity,
|
691
|
-
size_t * n_token_count_out),
|
692
|
-
"use llama_state_load_file instead");
|
693
|
-
|
694
|
-
LLAMA_API bool llama_state_save_file(
|
695
|
-
struct llama_context * ctx,
|
696
|
-
const char * path_session,
|
697
|
-
const llama_token * tokens,
|
698
|
-
size_t n_token_count);
|
699
|
-
LLAMA_API DEPRECATED(bool llama_save_session_file(
|
700
|
-
struct llama_context * ctx,
|
701
|
-
const char * path_session,
|
702
|
-
const llama_token * tokens,
|
703
|
-
size_t n_token_count),
|
704
|
-
"use llama_state_save_file instead");
|
705
|
-
|
706
|
-
// Get the exact size needed to copy the KV cache of a single sequence
|
707
|
-
LLAMA_API size_t llama_state_seq_get_size(
|
708
|
-
struct llama_context * ctx,
|
709
|
-
llama_seq_id seq_id);
|
710
|
-
|
711
|
-
// Copy the KV cache of a single sequence into the specified buffer
|
712
|
-
LLAMA_API size_t llama_state_seq_get_data(
|
713
|
-
struct llama_context * ctx,
|
714
|
-
uint8_t * dst,
|
715
|
-
llama_seq_id seq_id);
|
716
|
-
|
717
|
-
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
|
718
|
-
// Returns:
|
719
|
-
// - Positive: Ok
|
720
|
-
// - Zero: Failed to load
|
721
|
-
LLAMA_API size_t llama_state_seq_set_data(
|
722
|
-
struct llama_context * ctx,
|
723
|
-
const uint8_t * src,
|
724
|
-
llama_seq_id dest_seq_id);
|
725
|
-
|
726
|
-
LLAMA_API size_t llama_state_seq_save_file(
|
727
|
-
struct llama_context * ctx,
|
728
|
-
const char * filepath,
|
729
|
-
llama_seq_id seq_id,
|
730
|
-
const llama_token * tokens,
|
731
|
-
size_t n_token_count);
|
732
|
-
|
733
|
-
LLAMA_API size_t llama_state_seq_load_file(
|
734
|
-
struct llama_context * ctx,
|
735
|
-
const char * filepath,
|
736
|
-
llama_seq_id dest_seq_id,
|
737
|
-
llama_token * tokens_out,
|
738
|
-
size_t n_token_capacity,
|
739
|
-
size_t * n_token_count_out);
|
740
|
-
|
741
|
-
//
|
742
|
-
// Decoding
|
743
|
-
//
|
744
|
-
|
745
|
-
// Return batch for single sequence of tokens starting at pos_0
|
746
|
-
//
|
747
|
-
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
|
748
|
-
//
|
749
|
-
LLAMA_API struct llama_batch llama_batch_get_one(
|
750
|
-
llama_token * tokens,
|
751
|
-
int32_t n_tokens,
|
752
|
-
llama_pos pos_0,
|
753
|
-
llama_seq_id seq_id);
|
754
|
-
|
755
|
-
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
|
756
|
-
// Each token can be assigned up to n_seq_max sequence ids
|
757
|
-
// The batch has to be freed with llama_batch_free()
|
758
|
-
// If embd != 0, llama_batch.embd will be allocated with size of n_tokens * embd * sizeof(float)
|
759
|
-
// Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
|
760
|
-
// The rest of the llama_batch members are allocated with size n_tokens
|
761
|
-
// All members are left uninitialized
|
762
|
-
LLAMA_API struct llama_batch llama_batch_init(
|
763
|
-
int32_t n_tokens,
|
764
|
-
int32_t embd,
|
765
|
-
int32_t n_seq_max);
|
766
|
-
|
767
|
-
// Frees a batch of tokens allocated with llama_batch_init()
|
768
|
-
LLAMA_API void llama_batch_free(struct llama_batch batch);
|
769
|
-
|
770
|
-
// Positive return values does not mean a fatal error, but rather a warning.
|
771
|
-
// 0 - success
|
772
|
-
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
|
773
|
-
// < 0 - error
|
774
|
-
LLAMA_API int32_t llama_decode(
|
775
|
-
struct llama_context * ctx,
|
776
|
-
struct llama_batch batch);
|
777
|
-
|
778
|
-
// Set the number of threads used for decoding
|
779
|
-
// n_threads is the number of threads used for generation (single token)
|
780
|
-
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
|
781
|
-
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
|
782
|
-
|
783
|
-
// Get the number of threads used for generation of a single token.
|
784
|
-
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
|
785
|
-
|
786
|
-
// Get the number of threads used for prompt and batch processing (multiple token).
|
787
|
-
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
|
788
|
-
|
789
|
-
// Set whether the model is in embeddings model or not
|
790
|
-
// If true, embeddings will be returned but logits will not
|
791
|
-
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
|
792
|
-
|
793
|
-
// Set whether to use causal attention or not
|
794
|
-
// If set to true, the model will only attend to the past tokens
|
795
|
-
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
|
796
|
-
|
797
|
-
// Set abort callback
|
798
|
-
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
|
799
|
-
|
800
|
-
// Wait until all computations are finished
|
801
|
-
// This is automatically done when using one of the functions below to obtain the computation results
|
802
|
-
// and is not necessary to call it explicitly in most cases
|
803
|
-
LLAMA_API void llama_synchronize(struct llama_context * ctx);
|
804
|
-
|
805
|
-
// Token logits obtained from the last call to llama_decode()
|
806
|
-
// The logits for which llama_batch.logits[i] != 0 are stored contiguously
|
807
|
-
// in the order they have appeared in the batch.
|
808
|
-
// Rows: number of tokens for which llama_batch.logits[i] != 0
|
809
|
-
// Cols: n_vocab
|
810
|
-
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
811
|
-
|
812
|
-
// Logits for the ith token. For positive indices, Equivalent to:
|
813
|
-
// llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
|
814
|
-
// Negative indicies can be used to access logits in reverse order, -1 is the last logit.
|
815
|
-
// returns NULL for invalid ids.
|
816
|
-
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
|
817
|
-
|
818
|
-
// Get all output token embeddings.
|
819
|
-
// when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
|
820
|
-
// the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
|
821
|
-
// in the order they have appeared in the batch.
|
822
|
-
// shape: [n_outputs*n_embd]
|
823
|
-
// Otherwise, returns NULL.
|
824
|
-
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
825
|
-
|
826
|
-
// Get the embeddings for the ith token. For positive indices, Equivalent to:
|
827
|
-
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
|
828
|
-
// Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
|
829
|
-
// shape: [n_embd] (1-dimensional)
|
830
|
-
// returns NULL for invalid ids.
|
831
|
-
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
|
832
|
-
|
833
|
-
// Get the embeddings for a sequence id
|
834
|
-
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
|
835
|
-
// shape: [n_embd] (1-dimensional)
|
836
|
-
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
|
837
|
-
|
838
|
-
//
|
839
|
-
// Vocab
|
840
|
-
//
|
841
|
-
|
842
|
-
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
|
843
|
-
|
844
|
-
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
|
845
|
-
|
846
|
-
LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
|
847
|
-
|
848
|
-
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
849
|
-
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
850
|
-
|
851
|
-
// Identify if Token Id is a control token or a render-able token
|
852
|
-
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
853
|
-
|
854
|
-
// Special tokens
|
855
|
-
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
856
|
-
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
857
|
-
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
|
858
|
-
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
|
859
|
-
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
860
|
-
|
861
|
-
// Returns -1 if unknown, 1 for true or 0 for false.
|
862
|
-
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
863
|
-
|
864
|
-
// Returns -1 if unknown, 1 for true or 0 for false.
|
865
|
-
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
866
|
-
|
867
|
-
// Codellama infill tokens
|
868
|
-
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
869
|
-
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
870
|
-
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
871
|
-
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
|
872
|
-
|
873
|
-
//
|
874
|
-
// Tokenization
|
875
|
-
//
|
876
|
-
|
877
|
-
/// @details Convert the provided text into tokens.
|
878
|
-
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
|
879
|
-
/// @return Returns the number of tokens on success, no more than n_tokens_max
|
880
|
-
/// @return Returns a negative number on failure - the number of tokens that would have been returned
|
881
|
-
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
|
882
|
-
/// as plaintext. Does not insert a leading space.
|
883
|
-
LLAMA_API int32_t llama_tokenize(
|
884
|
-
const struct llama_model * model,
|
885
|
-
const char * text,
|
886
|
-
int32_t text_len,
|
887
|
-
llama_token * tokens,
|
888
|
-
int32_t n_tokens_max,
|
889
|
-
bool add_special,
|
890
|
-
bool parse_special);
|
891
|
-
|
892
|
-
// Token Id -> Piece.
|
893
|
-
// Uses the vocabulary in the provided context.
|
894
|
-
// Does not write null terminator to the buffer.
|
895
|
-
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
896
|
-
// @param special If true, special tokens are rendered in the output.
|
897
|
-
LLAMA_API int32_t llama_token_to_piece(
|
898
|
-
const struct llama_model * model,
|
899
|
-
llama_token token,
|
900
|
-
char * buf,
|
901
|
-
int32_t length,
|
902
|
-
bool special);
|
903
|
-
|
904
|
-
-    /// Apply chat template. Inspired by hf apply_chat_template() on python.
-    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
-    /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
-    /// @param chat Pointer to a list of multiple llama_chat_message
-    /// @param n_msg Number of llama_chat_message in this chat
-    /// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
-    /// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
-    /// @param length The size of the allocated buffer
-    /// @return The total number of bytes of the formatted prompt. If is it larger than the size of buffer, you may need to re-alloc it and then re-apply the template.
-    LLAMA_API int32_t llama_chat_apply_template(
-            const struct llama_model * model,
-            const char * tmpl,
-            const struct llama_chat_message * chat,
-            size_t n_msg,
-            bool add_ass,
-            char * buf,
-            int32_t length);
-
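Because the return value is the full formatted length, the documented pattern is: allocate per the recommendation, then re-allocate and re-apply if the result is larger than the buffer. A minimal sketch, assuming the llama_chat_message struct (role/content string pairs) declared earlier in this header:

    #include <stdlib.h>
    #include <string.h>
    #include "llama.h"

    // Format a two-message chat with the model's built-in template (tmpl == NULL).
    // Returns the formatted length and hands back the (not necessarily null-terminated) buffer.
    static int32_t format_chat(const struct llama_model * model, char ** out_buf) {
        const struct llama_chat_message chat[] = {
            { "system", "You are a helpful assistant." },
            { "user",   "Hello!"                       },
        };
        const size_t n_msg = sizeof(chat) / sizeof(chat[0]);

        int32_t cap = 0; // recommended starting size: 2 * total characters of all messages
        for (size_t i = 0; i < n_msg; ++i) cap += 2 * (int32_t) strlen(chat[i].content);

        char * buf = malloc(cap);
        int32_t n  = llama_chat_apply_template(model, NULL, chat, n_msg, /*add_ass=*/true, buf, cap);
        if (n > cap) {               // documented: return value is the total formatted size
            buf = realloc(buf, n);
            n   = llama_chat_apply_template(model, NULL, chat, n_msg, true, buf, n);
        }
        *out_buf = buf;
        return n;
    }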
-    //
-    // Grammar
-    //
-
-    LLAMA_API struct llama_grammar * llama_grammar_init(
-            const llama_grammar_element ** rules,
-            size_t n_rules,
-            size_t start_rule_index);
-
-    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
-
-    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
-
-    //
-    // Sampling functions
-    //
-
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
-
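Everything below operates on a llama_token_data_array built from one row of logits. A minimal sketch of preparing that array; llama_n_vocab and the llama_token_data / llama_token_data_array types are declared earlier in this header, and the logits row would typically come from llama_get_logits / llama_get_logits_ith (also declared earlier):

    #include "llama.h"

    // Wrap a row of n_vocab logits into the candidates array the sampling API expects.
    // `storage` must have room for llama_n_vocab(model) entries.
    static llama_token_data_array make_candidates(const struct llama_model * model,
                                                  const float * logits,
                                                  llama_token_data * storage) {
        const int32_t n_vocab = llama_n_vocab(model);
        for (llama_token id = 0; id < n_vocab; ++id) {
            storage[id].id    = id;
            storage[id].logit = logits[id];
            storage[id].p     = 0.0f; // probabilities are filled in by llama_sample_softmax & friends
        }
        llama_token_data_array candidates = { storage, (size_t) n_vocab, /*sorted=*/false };
        return candidates;
    }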
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const llama_token * last_tokens,
-            size_t penalty_last_n,
-            float penalty_repeat,
-            float penalty_freq,
-            float penalty_present);
-
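A short sketch of keeping a recent-token window and penalizing it before any filtering; the window size and penalty values are illustrative, not defaults taken from this header:

    #include <string.h>
    #include "llama.h"

    #define N_PREV 64 // illustrative history window, not a header default

    // Append a token to a fixed-size history, dropping the oldest entry when full.
    static void push_history(llama_token * prev, size_t * n_prev, llama_token tok) {
        if (*n_prev == N_PREV) {
            memmove(prev, prev + 1, (N_PREV - 1) * sizeof(*prev));
            *n_prev = N_PREV - 1;
        }
        prev[(*n_prev)++] = tok;
    }

    // Discourage tokens that already appeared in the history window.
    static void penalize_history(struct llama_context * ctx, llama_token_data_array * candidates,
                                 const llama_token * prev, size_t n_prev) {
        llama_sample_repetition_penalties(ctx, candidates, prev, n_prev,
                                          /*penalty_repeat =*/ 1.1f,  // > 1.0f discourages reuse
                                          /*penalty_freq   =*/ 0.0f,
                                          /*penalty_present=*/ 0.0f);
    }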
-    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
-            struct llama_context * ctx,
-            float * logits,
-            float * logits_guidance,
-            float scale);
-
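A minimal sketch of the call itself, assuming two contexts created from the same model (the guidance context fed the negative prompt plus the same tokens as the main one, per the note above) and llama_get_logits declared earlier in this header:

    #include "llama.h"

    // Blend the main context's logits against the guidance context's logits.
    static void apply_cfg(struct llama_context * ctx, struct llama_context * ctx_guidance, float scale) {
        float * logits          = llama_get_logits(ctx);
        float * logits_guidance = llama_get_logits(ctx_guidance);
        llama_sample_apply_guidance(ctx, logits, logits_guidance, scale); // scale == 1.0f means no guidance
    }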
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
-
-    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            int32_t k,
-            size_t min_keep);
-
-    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
-
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API void llama_sample_min_p(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float z,
-            size_t min_keep);
-
-    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
-
-    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates_p,
-            float min_temp,
-            float max_temp,
-            float exponent_val);
-
-    LLAMA_API void llama_sample_temp(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float temp);
-
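These filters compose: each call prunes or reweights the candidates array in place, and a final llama_sample_token (declared further down) draws from whatever remains. The ordering and values in this sketch are illustrative, loosely following common llama.cpp example code rather than anything this header mandates:

    #include "llama.h"

    // One possible filter chain: top-k -> tail-free -> typical -> top-p -> min-p -> temperature.
    static llama_token sample_next(struct llama_context * ctx, llama_token_data_array * candidates) {
        const size_t min_keep = 1; // never filter down to an empty candidate set
        llama_sample_top_k    (ctx, candidates, /*k=*/40,    min_keep);
        llama_sample_tail_free(ctx, candidates, /*z=*/1.00f, min_keep);
        llama_sample_typical  (ctx, candidates, /*p=*/1.00f, min_keep);
        llama_sample_top_p    (ctx, candidates, /*p=*/0.95f, min_keep);
        llama_sample_min_p    (ctx, candidates, /*p=*/0.05f, min_keep);
        llama_sample_temp     (ctx, candidates, /*temp=*/0.80f);
        return llama_sample_token(ctx, candidates); // uses the RNG of ctx (see llama_set_rng_seed above)
    }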
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const struct llama_grammar * grammar);
-
-    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float tau,
-            float eta,
-            int32_t m,
-            float * mu);
-
-    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float tau,
-            float eta,
-            float * mu);
-
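Mirostat replaces the filter chain above: mu is per-sequence state, initialized to 2 * tau as documented and updated by the call itself. A minimal sketch of carrying that state across steps (the tau/eta values are illustrative):

    #include "llama.h"

    // Per-sequence mirostat state; initialize once, then reuse for every sampling step.
    struct mirostat_state {
        float tau; // target surprise
        float eta; // learning rate
        float mu;  // running maximum cross-entropy, starts at 2 * tau
    };

    static struct mirostat_state mirostat_init(float tau, float eta) {
        struct mirostat_state st = { tau, eta, 2.0f * tau };
        return st;
    }

    // Illustrative values often used in examples: tau = 5.0f, eta = 0.1f.
    static llama_token sample_mirostat_v2(struct llama_context * ctx,
                                          llama_token_data_array * candidates,
                                          struct mirostat_state * st) {
        return llama_sample_token_mirostat_v2(ctx, candidates, st->tau, st->eta, &st->mu);
    }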
-    /// @details Selects the token with the highest probability.
-    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
-    LLAMA_API llama_token llama_sample_token_greedy(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
-
-    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
-    LLAMA_API llama_token llama_sample_token(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
-
-    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(
-            struct llama_context * ctx,
-            struct llama_grammar * grammar,
-            llama_token token);
-
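With a grammar in play the pattern is symmetric: constrain the candidates with llama_sample_grammar (declared above) before choosing a token, then report the choice back with llama_grammar_accept_token so the grammar state advances. A sketch; the greedy pick is just for brevity:

    #include "llama.h"

    // One grammar-constrained decode step.
    static llama_token sample_with_grammar(struct llama_context * ctx, struct llama_grammar * grammar,
                                           llama_token_data_array * candidates) {
        llama_sample_grammar(ctx, candidates, grammar);   // mask out tokens the grammar forbids
        const llama_token tok = llama_sample_token_greedy(ctx, candidates);
        llama_grammar_accept_token(ctx, grammar, tok);    // advance the grammar's parse state
        return tok;
    }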
-    //
-    // Model split
-    //
-
-    /// @details Build a split GGUF final path for this chunk.
-    ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
-    //  Returns the split_path length.
-    LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
-
-    /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
-    ///          llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
-    //  Returns the split_prefix length.
-    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
-
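A minimal sketch that simply reproduces the two documented examples above, round-tripping a prefix through llama_split_path and back through llama_split_prefix:

    #include <stdio.h>
    #include "llama.h"

    static void split_path_demo(void) {
        char split_path[512];
        char prefix[64];

        // Per the comment above: "/models/ggml-model-q4_0-00002-of-00004.gguf"
        if (llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) > 0) {
            printf("%s\n", split_path);
        }
        // Per the comment above: "/models/ggml-model-q4_0"
        if (llama_split_prefix(prefix, sizeof(prefix), split_path, 2, 4) > 0) {
            printf("%s\n", prefix);
        }
    }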
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
-
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
-
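A sketch of routing future log output to a file instead of stderr; the ggml_log_callback signature (level, text, user_data) comes from ggml.h, which this header pulls in, and the file-logging policy is purely illustrative:

    #include <stdio.h>
    #include "llama.h"

    // Append every log line to the FILE* passed as user_data.
    static void file_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; // could be used to filter by severity
        FILE * fp = (FILE *) user_data;
        fputs(text, fp);
        fflush(fp);
    }

    static void install_logger(FILE * fp) {
        llama_log_set(file_logger, fp); // per the comment above, a NULL callback falls back to stderr
    }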
-#ifdef __cplusplus
-}
-#endif
-
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-#include <random>
-#include <string>
-#include <vector>
-
-struct ggml_tensor;
-
-struct llama_partial_utf8 {
-    uint32_t value;    // bit value so far (unshifted)
-    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>>   rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8 partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t             index;
-    const uint32_t   * code_points;
-    llama_partial_utf8 partial_utf8;
-};
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-void llama_grammar_accept(
-        const std::vector<std::vector<llama_grammar_element>>         & rules,
-        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-        const uint32_t chr,
-        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
-
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const std::string & src,
-        llama_partial_utf8  partial_start);
-
-// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
-// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
-
-#endif // LLAMA_API_INTERNAL
-
-#endif // LLAMA_H