llama_cpp 0.3.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -10,6 +10,7 @@
 #endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
@@ -34,29 +35,18 @@
 #    define DEPRECATED(func, hint) func
 #endif
 
-#define
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -72,6 +62,52 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN = 3,
+        LLAMA_LOG_LEVEL_INFO = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED = 0,
+        LLAMA_TOKEN_TYPE_NORMAL = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN = 2,
+        LLAMA_TOKEN_TYPE_CONTROL = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED = 5,
+        LLAMA_TOKEN_TYPE_BYTE = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32 = 0,
+        LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit; // log-odds of the token
@@ -86,25 +122,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN = 3,
-        LLAMA_LOG_LEVEL_INFO = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
         int32_t n_ctx; // text context
         int32_t n_batch; // prompt processing batch size
-        int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
-        float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu; // the GPU that is used for scratch and small tensors
 
@@ -129,33 +150,18 @@ extern "C" {
         bool use_mlock; // force system to keep model in RAM
         bool embedding; // embedding mode only
     };
-
-
-
-
-
-
-
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype
+        enum llama_ftype ftype; // quantize to this llama_ftype
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
@@ -208,27 +214,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-
-
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
             const char * path_model,
@@ -240,17 +235,32 @@ extern "C" {
             struct llama_model * model,
             struct llama_context_params params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-            const char * path_model,
-            struct llama_context_params params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int llama_max_devices (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
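The model introspection calls added in this hunk (llama_model_desc, llama_model_size, llama_model_n_params) replace the old *_from_model helpers removed later in this diff. A minimal sketch of how they might be used; the "model.gguf" path is a placeholder, and llama_free_model is assumed to be declared elsewhere in llama.h (it is not part of this hunk):

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false); // numa = false

    struct llama_context_params params = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", params); // placeholder path
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // short human-readable description
    printf("model : %s\n", desc);
    printf("size  : %llu bytes\n", (unsigned long long) llama_model_size(model));
    printf("params: %llu\n",       (unsigned long long) llama_model_n_params(model));

    llama_free_model(model); // assumed to be declared elsewhere in llama.h
    llama_backend_free();
    return 0;
}
```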
@@ -272,9 +282,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-
-
-
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -324,11 +334,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
             const char * text,
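The comments above fix the layout of the logits buffer (Rows: n_tokens, Cols: n_vocab, last row = distribution for the next token). A hedged sketch of a greedy sampling step built on that layout; it assumes the context was evaluated with logits available for all n_tokens rows, and that generation stops when the result equals llama_token_eos(ctx):

```c
#include "llama.h"

// Pick the highest-logit token from the last row of the logits buffer.
// n_tokens is the number of tokens in the batch that was just evaluated;
// the buffer is assumed to hold one row per token, as described above.
static llama_token greedy_next_token(struct llama_context * ctx, int n_tokens) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * row     = llama_get_logits(ctx) + (size_t)(n_tokens - 1) * n_vocab;

    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; ++id) {
        if (row[id] > row[best]) {
            best = id;
        }
    }
    return best; // stop generating once this equals llama_token_eos(ctx)
}
```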
@@ -343,57 +382,26 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);
 
-
-
-
-
-    LLAMA_API int
-    LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
             const struct llama_context * ctx,
-
-
-
+            llama_token token,
+            char * buf,
+            int length);
 
-    LLAMA_API int
+    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
-
-
-
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
-            const struct llama_context * ctx,
-            llama_token token);
-
-    LLAMA_API const char * llama_token_to_str_with_model(
-            const struct llama_model * model,
-            llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl(); // next-line
+            llama_token token,
+            char * buf,
+            int length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
             size_t n_rules,
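llama_token_to_piece replaces the old llama_token_to_str: instead of returning an internal string it writes into a caller-supplied buffer and does not null-terminate it. A hedged sketch of a caller, assuming the return value is the number of bytes written and that a negative value signals a too-small buffer (the convention used by llama.cpp's own helper wrappers of this period):

```c
#include <stdio.h>
#include "llama.h"

// Print the text of a single token using the new buffer-based API.
static void print_token(struct llama_context * ctx, llama_token token) {
    char buf[64];
    const int n = llama_token_to_piece(ctx, token, buf, (int) sizeof(buf));
    if (n < 0) {
        // assumed convention: -n is the required buffer size; a real caller would retry
        fprintf(stderr, "token needs a %d-byte buffer\n", -n);
        return;
    }
    // no null terminator is written, so print exactly n bytes
    printf("%.*s", n, buf);
}
```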
@@ -401,7 +409,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -462,6 +472,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float p; // Cumulative beam probability (renormalized relative to all beams)
+        bool eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams; // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool last_call; // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
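The beam-search callback receives llama_beams_state by value once per iteration; per the comments above, the common_prefix_length tokens shared by all beams should be copied out (e.g. from beams[0]) before they are shifted away. A sketch of such a callback, assuming a caller-owned beam_collect buffer passed through callback_data (both names are illustrative, not part of the API):

```c
#include <stdio.h>
#include "llama.h"

// Caller-owned accumulator handed to llama_beam_search through callback_data.
struct beam_collect {
    llama_token tokens[1024];
    size_t      n_tokens;
};

// Sketch of a llama_beam_search_callback_fn_t: harvest the common prefix and
// log the per-beam probabilities. Pointers in beams_state are only valid here.
static void collect_beams(void * callback_data, struct llama_beams_state beams_state) {
    struct beam_collect * out = (struct beam_collect *) callback_data;

    if (beams_state.common_prefix_length > 0) {
        const struct llama_beam_view * beam = &beams_state.beam_views[0];
        for (size_t i = 0; i < beams_state.common_prefix_length && out->n_tokens < 1024; ++i) {
            out->tokens[out->n_tokens++] = beam->tokens[i];
        }
    }

    for (size_t i = 0; i < beams_state.n_beams; ++i) {
        fprintf(stderr, "beam %zu: p=%.4f eob=%d\n",
                i, beams_state.beam_views[i].p, (int) beams_state.beam_views[i].eob);
    }
}

// Usage (hedged): struct beam_collect out = {0};
// llama_beam_search(ctx, collect_beams, &out, /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);
```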
@@ -470,6 +517,12 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
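llama_log_set is now part of the public tail of the header; the llama_log_callback typedef and its caveat about trailing newlines appear earlier in this diff. A small sketch of a callback that drops INFO-level messages and strips the newline as the comments advise:

```c
#include <stdio.h>
#include <string.h>
#include "llama.h"

// Forward warnings and errors to stderr without the trailing '\n'; ignore INFO.
static void my_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == LLAMA_LOG_LEVEL_INFO) {
        return;
    }
    size_t len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        len--; // most messages end with a newline; progress dots may not
    }
    fprintf(stderr, "[llama] %.*s\n", (int) len, text);
}

// Install before loading any model: llama_log_set(my_log_callback, NULL);
```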
@@ -479,10 +532,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.8'
+  VERSION = '0.5.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1140'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -99,9 +99,9 @@ module LLaMACpp
       end
     end
 
-    embd.each { |token| output << context.token_to_str(token) }
+    embd.each { |token| output << context.token_to_piece(token) }
 
-    break if !embd.empty? && embd[-1] ==
+    break if !embd.empty? && embd[-1] == context.token_eos
   end
 
   output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -1,9 +1,6 @@
 module LLaMACpp
   VERSION: String
   LLAMA_CPP_VERSION: String
-  LLAMA_FILE_VERSION: String
-  LLAMA_FILE_MAGIC: String
-  LLAMA_FILE_MAGIC_UNVERSIONED: String
   LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
@@ -42,9 +39,7 @@ module LLaMACpp
                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
-  def self?.token_bos: () -> Integer
-  def self?.token_eos: () -> Integer
-  def self?.token_nl: () -> Integer
+  def self?.time_us: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
@@ -81,10 +76,11 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def
-    def token_to_str: (Integer) -> String
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def
+    def desc: () -> String
+    def size: () -> Integer
+    def n_params: () -> Integer
   end
 
   class Timings
@@ -106,6 +102,12 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
@@ -113,11 +115,10 @@ module LLaMACpp
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
-    def token_to_str: (Integer) -> String
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.8
+  version: 0.5.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-09-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: