llama_cpp 0.3.8 → 0.5.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +19 -0
- data/README.md +1 -1
- data/examples/chat.rb +4 -6
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +129 -124
- data/ext/llama_cpp/src/ggml-alloc.c +90 -113
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +350 -77
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +4 -0
- data/ext/llama_cpp/src/ggml-metal.m +226 -121
- data/ext/llama_cpp/src/ggml-metal.metal +157 -35
- data/ext/llama_cpp/src/ggml.c +2724 -584
- data/ext/llama_cpp/src/ggml.h +282 -31
- data/ext/llama_cpp/src/k_quants.c +112 -56
- data/ext/llama_cpp/src/llama.cpp +4857 -2986
- data/ext/llama_cpp/src/llama.h +180 -126
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +12 -11
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -10,6 +10,7 @@
 #endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
@@ -34,29 +35,18 @@
 # define DEPRECATED(func, hint) func
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -72,6 +62,52 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -86,25 +122,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
@@ -129,33 +150,18 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
     };
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
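For context, a minimal sketch of how the relocated `llama_ftype` enum and these quantize parameters are meant to be used together, assuming the signature `llama_model_quantize(fname_inp, fname_out, &params)` shown further down in this header; the GGUF file names are placeholders, not part of this diff:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    // Call once at the start of the program (see llama_backend_init below).
    llama_backend_init(false /* numa */);

    struct llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // quantize to this llama_ftype
    params.nthread = 4;                         // <= 0 would use hardware_concurrency()

    // Returns 0 on success; "input.gguf"/"output.gguf" are placeholder paths.
    if (llama_model_quantize("input.gguf", "output.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }

    llama_backend_free();
    return 0;
}
```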
@@ -208,27 +214,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
@@ -240,17 +235,32 @@ extern "C" {
                      struct llama_model * model,
             struct llama_context_params   params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
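The new `llama_model_desc`, `llama_model_size`, and `llama_model_n_params` accessors make it possible to report on a model without ever creating a context. A sketch, assuming `llama_free_model` (declared next to `llama_load_model_from_file` in this header) releases the handle; "model.gguf" is a placeholder path:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_backend_init(false);

    struct llama_context_params cparams = llama_context_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", cparams);
    if (model == NULL) {
        return 1; // load failed
    }

    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // short model-type string
    printf("%s: %llu params, %llu bytes\n", desc,
           (unsigned long long) llama_model_n_params(model),
           (unsigned long long) llama_model_size(model));

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```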
@@ -272,9 +282,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                          const char * path_lora,
-                          const char * path_base_model,
-                                   int n_threads);
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -324,11 +334,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                       const char * text,
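Per the comments above, `llama_get_logits` exposes an n_tokens × n_vocab matrix whose last row belongs to the last evaluated token. A sketch of a greedy argmax over that row, assuming the previous `llama_eval` call processed a single token (for a batch, offset by `(n_tokens - 1) * n_vocab` first); `greedy_next_token` is an illustrative name:

```c
#include "llama.h"

// Pick the most likely next token from the logits of the last llama_eval().
static llama_token greedy_next_token(struct llama_context * ctx) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx); // row for the last token

    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; id++) {
        if (logits[id] > logits[best]) {
            best = id;
        }
    }
    // Callers typically stop generating once best == llama_token_eos(ctx).
    return best;
}
```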
@@ -343,57 +382,26 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
             const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
 
-    LLAMA_API int llama_get_vocab_from_model(
+    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
-            const struct llama_context * ctx,
-                           llama_token   token);
-
-    LLAMA_API const char * llama_token_to_str_with_model(
-            const struct llama_model * model,
-                           llama_token   token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl(); // next-line
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                    size_t   n_rules,
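The negative-return convention of `llama_tokenize` and the no-terminator contract of the new `llama_token_to_piece` combine as in this sketch; `dump_tokens` is an illustrative helper name, and the context is assumed to exist already:

```c
#include <stdio.h>
#include <stdlib.h>
#include "llama.h"

// Tokenize a prompt, growing the buffer when llama_tokenize() reports (as a
// negative number) how many tokens it would have needed, then print each
// token via llama_token_to_piece(), which does not NUL-terminate its buffer.
static void dump_tokens(struct llama_context * ctx, const char * text) {
    int n_max = 32;
    llama_token * tokens = malloc(n_max * sizeof(llama_token));

    int n = llama_tokenize(ctx, text, tokens, n_max, true /* add_bos */);
    if (n < 0) { // buffer too small: -n is the required count
        n_max  = -n;
        tokens = realloc(tokens, n_max * sizeof(llama_token));
        n      = llama_tokenize(ctx, text, tokens, n_max, true);
    }

    for (int i = 0; i < n; i++) {
        char buf[64];
        const int len = llama_token_to_piece(ctx, tokens[i], buf, sizeof(buf));
        printf("%d: '%.*s'\n", tokens[i], len, buf); // len bytes, no terminator
    }
    free(tokens);
}
```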
@@ -401,7 +409,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -462,6 +472,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
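A sketch of the callback contract documented above: beams that just produced EOS are flagged as `eob`, with the context threaded through `callback_data` so the callback can query `llama_token_eos`. The helper name and the parameter values in the usage comment are illustrative:

```c
#include "llama.h"

// Beam-search callback: mark beams whose last token is EOS as finished.
static void beam_search_cb(void * callback_data, struct llama_beams_state beams_state) {
    struct llama_context * ctx = callback_data;

    for (size_t i = 0; i < beams_state.n_beams; i++) {
        struct llama_beam_view * bv = &beams_state.beam_views[i];
        if (!bv->eob && bv->n_tokens > 0 &&
            bv->tokens[bv->n_tokens - 1] == llama_token_eos(ctx)) {
            bv->eob = true; // this beam is at end-of-beam
        }
        // bv->tokens is only valid during this synchronous callback; copy out
        // the first common_prefix_length tokens here if you need to keep them.
    }
}

// Usage (assuming the prompt was already evaluated into ctx):
//   llama_beam_search(ctx, beam_search_cb, ctx, /*n_beams=*/4,
//                     /*n_past=*/n_prompt_tokens, /*n_predict=*/64, /*n_threads=*/4);
```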
@@ -470,6 +517,12 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
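A sketch of wiring the logging hook added here, assuming you only want to keep warnings and errors (level values follow the `llama_log_level` enum above, where lower is more severe):

```c
#include <stdio.h>
#include "llama.h"

// Route llama.cpp logs through a custom callback instead of the default stderr.
// Per the header, `text` usually ends with '\n' (progress dots may not).
static void my_log_cb(enum llama_log_level level, const char * text, void * user_data) {
    FILE * sink = user_data;
    if (level <= LLAMA_LOG_LEVEL_WARN) { // ERROR = 2, WARN = 3: keep; INFO = 4: drop
        fputs(text, sink);
    }
}

// Call once, before loading any models:
//   llama_log_set(my_log_cb, stderr);
```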
@@ -479,10 +532,11 @@
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.8'
+  VERSION = '0.5.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1140'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -99,9 +99,9 @@ module LLaMACpp
       end
     end
 
-    embd.each { |token| output << context.token_to_str(token) }
+    embd.each { |token| output << context.token_to_piece(token) }
 
-    break if !embd.empty? && embd[-1] == token_eos
+    break if !embd.empty? && embd[-1] == context.token_eos
   end
 
   output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -1,9 +1,6 @@
 module LLaMACpp
   VERSION: String
   LLAMA_CPP_VERSION: String
-  LLAMA_FILE_VERSION: String
-  LLAMA_FILE_MAGIC: String
-  LLAMA_FILE_MAGIC_UNVERSIONED: String
   LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
@@ -42,9 +39,7 @@ module LLaMACpp
     ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
     ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
-  def self?.token_bos: () -> Integer
-  def self?.token_eos: () -> Integer
-  def self?.token_nl: () -> Integer
+  def self?.time_us: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
@@ -81,10 +76,11 @@ module LLaMACpp
   def n_vocab: () -> Integer
   def n_ctx: () -> Integer
   def n_embd: () -> Integer
-  def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
-  def token_to_str: (Integer) -> String
+  def token_to_piece: (Integer) -> String
   def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-  def type: () -> String
+  def desc: () -> String
+  def size: () -> Integer
+  def n_params: () -> Integer
 end
@@ -106,6 +102,12 @@ module LLaMACpp
 
   def initialize: (model: ::LLaMACpp::Model) -> void
   def embeddings: () -> Array[Float]
+  def text: (Integer) -> String
+  def score: (Integer) -> Float
+  def type: (Integer) -> Integer
+  def token_bos: () -> Integer
+  def token_eos: () -> Integer
+  def token_nl: () -> Integer
   def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
   def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
   def eval_export: (String) -> bool
@@ -113,11 +115,10 @@ module LLaMACpp
   def n_ctx: () -> Integer
   def n_embd: () -> Integer
   def n_vocab: () -> Integer
-  def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
   def timings: () -> ::LLaMACpp::Timings
   def print_timings: () -> void
   def reset_timings: () -> void
-  def token_to_str: (Integer) -> String
+  def token_to_piece: (Integer) -> String
   def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
   def kv_cache_token_count: () -> Integer
   def set_rng_seed: (Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.8
+  version: 0.5.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-09-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: