llama_cpp 0.3.8 → 0.4.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -34,29 +34,18 @@
 # define DEPRECATED(func, hint) func
 #endif
 
-#define
-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define
-#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define
+#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -72,6 +61,52 @@ extern "C" {
 
 typedef int llama_token;
 
+enum llama_log_level {
+LLAMA_LOG_LEVEL_ERROR = 2,
+LLAMA_LOG_LEVEL_WARN = 3,
+LLAMA_LOG_LEVEL_INFO = 4
+};
+
+enum llama_vocab_type {
+LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+};
+
+enum llama_token_type {
+LLAMA_TOKEN_TYPE_UNDEFINED = 0,
+LLAMA_TOKEN_TYPE_NORMAL = 1,
+LLAMA_TOKEN_TYPE_UNKNOWN = 2,
+LLAMA_TOKEN_TYPE_CONTROL = 3,
+LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+LLAMA_TOKEN_TYPE_UNUSED = 5,
+LLAMA_TOKEN_TYPE_BYTE = 6,
+};
+
+// model file types
+enum llama_ftype {
+LLAMA_FTYPE_ALL_F32 = 0,
+LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
+
+LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+};
+
 typedef struct llama_token_data {
 llama_token id; // token id
 float logit; // log-odds of the token
@@ -86,25 +121,10 @@ extern "C" {
 
 typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-enum llama_log_level {
-LLAMA_LOG_LEVEL_ERROR = 2,
-LLAMA_LOG_LEVEL_WARN = 3,
-LLAMA_LOG_LEVEL_INFO = 4
-};
-
-// Signature for logging events
-// Note that text includes the new line character at the end for most events.
-// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-// if it exists.
-// It might not exist for progress report where '.' is output repeatedly.
-typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
 struct llama_context_params {
 uint32_t seed; // RNG seed, -1 for random
 int32_t n_ctx; // text context
 int32_t n_batch; // prompt processing batch size
-int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
-float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
 int32_t n_gpu_layers; // number of layers to store in VRAM
 int32_t main_gpu; // the GPU that is used for scratch and small tensors
 
@@ -129,33 +149,18 @@ extern "C" {
 bool use_mlock; // force system to keep model in RAM
 bool embedding; // embedding mode only
 };
-
-
-
-
-
-
-
-// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
-LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors
-LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors
-};
+
+// Signature for logging events
+// Note that text includes the new line character at the end for most events.
+// If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+// if it exists.
+// It might not exist for progress report where '.' is output repeatedly.
+typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
 // model quantization parameters
 typedef struct llama_model_quantize_params {
 int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-enum llama_ftype
+enum llama_ftype ftype; // quantize to this llama_ftype
 bool allow_requantize; // allow quantizing non-f32/f16 tensors
 bool quantize_output_tensor; // quantize output.weight
 } llama_model_quantize_params;
@@ -208,27 +213,16 @@ extern "C" {
 int32_t n_eval;
 };
 
-
-
-LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-LLAMA_API int llama_max_devices();
-
-LLAMA_API struct llama_context_params llama_context_default_params();
-LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-LLAMA_API bool llama_mmap_supported();
-LLAMA_API bool llama_mlock_supported();
+LLAMA_API struct llama_context_params llama_context_default_params(void);
+LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-// TODO: not great API - very likely to change
 // Initialize the llama + ggml backend
 // If numa is true, use NUMA optimizations
 // Call once at the start of the program
 LLAMA_API void llama_backend_init(bool numa);
-// Call once at the end of the program - currently only used for MPI
-LLAMA_API void llama_backend_free();
 
-
+// Call once at the end of the program - currently only used for MPI
+LLAMA_API void llama_backend_free(void);
 
 LLAMA_API struct llama_model * llama_load_model_from_file(
 const char * path_model,
@@ -240,17 +234,28 @@ extern "C" {
 struct llama_model * model,
 struct llama_context_params params);
 
-// Various functions for loading a ggml llama model.
-// Allocate (almost) all memory needed for the model.
-// Return NULL on failure
-LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-const char * path_model,
-struct llama_context_params params),
-"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
 // Frees all allocated memory
 LLAMA_API void llama_free(struct llama_context * ctx);
 
+LLAMA_API int64_t llama_time_us(void);
+
+LLAMA_API int llama_max_devices (void);
+LLAMA_API bool llama_mmap_supported (void);
+LLAMA_API bool llama_mlock_supported(void);
+
+LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
+LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+// Get a string describing the model type
+LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
 // Returns 0 on success
 LLAMA_API int llama_model_quantize(
 const char * fname_inp,
@@ -272,9 +277,9 @@ extern "C" {
 
 LLAMA_API int llama_model_apply_lora_from_file(
 const struct llama_model * model,
-
-
-
+const char * path_lora,
+const char * path_base_model,
+int n_threads);
 
 // Returns the number of tokens in the KV cache
 LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -324,11 +329,40 @@ extern "C" {
 // IMPORTANT: do not use for anything else other than debugging and testing!
 LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+// Token logits obtained from the last call to llama_eval()
+// The logits for the last token are stored in the last row
+// Can be mutated in order to change the probabilities of the next token
+// Rows: n_tokens
+// Cols: n_vocab
+LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+// Get the embeddings for the input
+// shape: [n_embd] (1-dimensional)
+LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+//
+// Vocab
+//
+
+LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+// Special tokens
+LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
+LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
+LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
+
+//
+// Tokenization
+//
+
 // Convert the provided text into tokens.
 // The tokens pointer must be large enough to hold the resulting tokens.
 // Returns the number of tokens on success, no more than n_max_tokens
 // Returns a negative number on failure - the number of tokens that would have been returned
-// TODO: not sure if correct
 LLAMA_API int llama_tokenize(
 struct llama_context * ctx,
 const char * text,
@@ -343,57 +377,24 @@ extern "C" {
 int n_max_tokens,
 bool add_bos);
 
-LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
-LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
-LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-// Get the vocabulary as output parameters.
-// Returns number of results.
-LLAMA_API int llama_get_vocab(
-const struct llama_context * ctx,
-const char * * strings,
-float * scores,
-int capacity);
-
-LLAMA_API int llama_get_vocab_from_model(
-const struct llama_model * model,
-const char * * strings,
-float * scores,
-int capacity);
-
-// Token logits obtained from the last call to llama_eval()
-// The logits for the last token are stored in the last row
-// Can be mutated in order to change the probabilities of the next token
-// Rows: n_tokens
-// Cols: n_vocab
-LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-// Get the embeddings for the input
-// shape: [n_embd] (1-dimensional)
-LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
 // Token Id -> String. Uses the vocabulary in the provided context
-
+// Does not write null terminator to the buffer
+LLAMA_API int llama_token_to_str(
 const struct llama_context * ctx,
-llama_token token
+llama_token token,
+char * buf,
+int length);
 
-LLAMA_API
+LLAMA_API int llama_token_to_str_with_model(
 const struct llama_model * model,
-llama_token token
-
-
-LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-LLAMA_API llama_token llama_token_nl(); // next-line
+llama_token token,
+char * buf,
+int length);
 
+//
 // Grammar
 //
+
 LLAMA_API struct llama_grammar * llama_grammar_init(
 const llama_grammar_element ** rules,
 size_t n_rules,
@@ -401,7 +402,9 @@ extern "C" {
 
 LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+//
 // Sampling functions
+//
 
 /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
 LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -470,6 +473,10 @@ extern "C" {
 // Print system information
 LLAMA_API const char * llama_print_system_info(void);
 
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif
@@ -479,10 +486,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
-VERSION = '0.
+VERSION = '0.4.0'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
-LLAMA_CPP_VERSION = '
+LLAMA_CPP_VERSION = 'b1060'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -101,7 +101,7 @@ module LLaMACpp
 
 embd.each { |token| output << context.token_to_str(token) }
 
-break if !embd.empty? && embd[-1] ==
+break if !embd.empty? && embd[-1] == context.token_eos
 end
 
 output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -1,9 +1,6 @@
 module LLaMACpp
 VERSION: String
 LLAMA_CPP_VERSION: String
-LLAMA_FILE_VERSION: String
-LLAMA_FILE_MAGIC: String
-LLAMA_FILE_MAGIC_UNVERSIONED: String
 LLAMA_DEFALUT_SEED: String
 
 LLAMA_MAX_DEVICES: Integer
@@ -42,9 +39,7 @@ module LLaMACpp
 ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
 ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
 def self?.print_system_info: () -> void
-def self?.
-def self?.token_eos: () -> Integer
-def self?.token_nl: () -> Integer
+def self?.time_us: () -> Integer
 def self?.mmap_supported?: () -> bool
 def self?.mlock_supported?: () -> bool
 def self?.max_devices: () -> Integer
@@ -81,7 +76,6 @@ module LLaMACpp
 def n_vocab: () -> Integer
 def n_ctx: () -> Integer
 def n_embd: () -> Integer
-def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
 def token_to_str: (Integer) -> String
 def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
 def type: () -> String
@@ -106,6 +100,12 @@ module LLaMACpp
 
 def initialize: (model: ::LLaMACpp::Model) -> void
 def embeddings: () -> Array[Float]
+def text: (Integer) -> String
+def score: (Integer) -> Float
+def type: (Integer) -> Integer
+def token_bos: () -> Integer
+def token_eos: () -> Integer
+def token_nl: () -> Integer
 def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
 def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
 def eval_export: (String) -> bool
@@ -113,7 +113,6 @@ module LLaMACpp
 def n_ctx: () -> Integer
 def n_embd: () -> Integer
 def n_vocab: () -> Integer
-def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
 def timings: () -> ::LLaMACpp::Timings
 def print_timings: () -> void
 def reset_timings: () -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-version: 0.
+version: 0.4.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-
+date: 2023-08-26 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: