llama_cpp 0.3.8 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +110 -117
- data/ext/llama_cpp/src/ggml-alloc.c +79 -65
- data/ext/llama_cpp/src/ggml-alloc.h +1 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +330 -69
- data/ext/llama_cpp/src/ggml-cuda.h +13 -0
- data/ext/llama_cpp/src/ggml-metal.h +3 -0
- data/ext/llama_cpp/src/ggml-metal.m +102 -66
- data/ext/llama_cpp/src/ggml-metal.metal +113 -9
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama.cpp +4520 -2978
- data/ext/llama_cpp/src/llama.h +133 -125
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +7 -8
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC    LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION  1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -72,6 +61,52 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
+
+        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -86,25 +121,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
@@ -129,33 +149,18 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
     };
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the new line character at the end for most events.
+    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
+    // if it exists.
+    // It might not exist for progress report where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
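The comments carried over with llama_log_callback spell out the newline contract: most messages arrive with a trailing '\n', while progress reports may emit bare '.' characters without one. A minimal consumer-side sketch (not part of the gem; buffer handling and output target are arbitrary choices), registered via llama_log_set, which this release moves to the end of the header:

```c
#include <stdio.h>
#include <string.h>
#include "llama.h"

// Forward llama.cpp log lines to stderr, stripping the trailing '\n' the
// header comment above says is usually present.
static void my_log_callback(enum llama_log_level level, const char * text, void * user_data) {
    (void) user_data;
    const size_t len = strlen(text);
    if (len > 0 && text[len - 1] == '\n') {
        fprintf(stderr, "[llama:%d] %.*s\n", (int) level, (int) (len - 1), text);
    } else {
        // progress dots and other partial output: pass through unchanged
        fprintf(stderr, "%s", text);
    }
}

// registration (llama_log_set is declared near the end of this header):
//   llama_log_set(my_log_callback, NULL);
```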
@@ -208,27 +213,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
@@ -240,17 +234,28 @@ extern "C" {
                              struct llama_model * model,
             struct llama_context_params   params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
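With the deprecated llama_init_from_file wrapper removed, the supported loading path is llama_backend_init → llama_load_model_from_file → llama_new_context_with_model. A sketch of that sequence using the declarations shown above; llama_free_model is assumed to exist as the counterpart of llama_load_model_from_file (it is not part of this hunk):

```c
#include <stdio.h>
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]); return 1; }

    llama_backend_init(false /* numa */);

    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512;
    params.n_gpu_layers = 0;

    struct llama_model * model = llama_load_model_from_file(argv[1], params);
    if (model == NULL) { fprintf(stderr, "failed to load model\n"); return 1; }

    struct llama_context * ctx = llama_new_context_with_model(model, params);

    char desc[128];
    llama_model_type(model, desc, sizeof(desc)); // "Get a string describing the model type"
    printf("loaded: %s (n_vocab=%d, n_ctx=%d, n_embd=%d)\n",
           desc, llama_model_n_vocab(model), llama_model_n_ctx(model), llama_model_n_embd(model));

    llama_free(ctx);
    llama_free_model(model); // assumed counterpart to llama_load_model_from_file (not shown in this hunk)
    llama_backend_free();
    return 0;
}
```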
@@ -272,9 +277,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                         const char * path_lora,
-                         const char * path_base_model,
-                                int   n_threads);
+                             const char * path_lora,
+                             const char * path_base_model,
+                                     int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
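For quantization, the relocated llama_ftype enum feeds the ftype field of llama_model_quantize_params. A sketch, assuming the remaining parameters of llama_model_quantize (only fname_inp is visible in this excerpt) are the output path and a pointer to the params struct:

```c
#include "llama.h"

// Sketch: re-quantize a model file to Q4_K_M. The parameter list of
// llama_model_quantize beyond fname_inp is assumed here, not taken from the diff.
int quantize_q4_k_m(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // quantize to this llama_ftype
    params.nthread = 4;                         // <=0 falls back to hardware_concurrency()

    // returns 0 on success (per the header comment above)
    return llama_model_quantize(fname_inp, fname_out, &params);
}
```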
@@ -324,11 +329,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                       const char * text,
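The llama_get_logits comments above document the buffer as n_tokens rows by n_vocab columns, with the last token's logits in the last row. A greedy argmax sketch over one such row (whether earlier rows are populated depends on context settings not shown in this excerpt):

```c
#include "llama.h"

// Greedy argmax over one row of logits (n_vocab entries).
static llama_token argmax_row(const float * row, int n_vocab) {
    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; ++id) {
        if (row[id] > row[best]) {
            best = id;
        }
    }
    return best;
}

// Per the header comments above, after evaluating n_tokens tokens the last
// token's logits sit in the last row of the buffer:
//
//   const int   n_vocab = llama_n_vocab(ctx);
//   llama_token next    = argmax_row(llama_get_logits(ctx) + (n_tokens - 1) * n_vocab, n_vocab);
```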
@@ -343,57 +377,24 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                            const char * * strings,
-                                   float * scores,
-                                     int   capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                            const char * * strings,
-                                   float * scores,
-                                     int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
-                           llama_token   token);
+                           llama_token   token,
+                                  char * buf,
+                                   int   length);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_with_model(
               const struct llama_model * model,
-                           llama_token   token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();   // next-line
+                           llama_token   token,
+                                  char * buf,
+                                   int   length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
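llama_token_to_str now writes into a caller-supplied buffer and, per the comment above, does not null-terminate it. A round-trip sketch that also uses the two-pass llama_tokenize contract from the context lines (a negative return is the required token count); the llama_token * output parameter of llama_tokenize falls between the hunks and is assumed here, as is the interpretation of llama_token_to_str's return value as the number of bytes written:

```c
#include <stdio.h>
#include <stdlib.h>
#include "llama.h"

// Tokenize a prompt and print it back token by token.
static void dump_tokens(struct llama_context * ctx, const char * text) {
    // first pass with a small array; a negative result tells us how many tokens are needed
    llama_token small[8];
    int n = llama_tokenize(ctx, text, small, 8, /*add_bos=*/true);

    llama_token * tokens = small;
    if (n < 0) {
        tokens = malloc((size_t)(-n) * sizeof(llama_token));
        n = llama_tokenize(ctx, text, tokens, -n, /*add_bos=*/true);
    }

    for (int i = 0; i < n; ++i) {
        char buf[64];
        const int len = llama_token_to_str(ctx, tokens[i], buf, (int) sizeof(buf));
        if (len > 0) {
            printf("%5d -> '%.*s'\n", tokens[i], len, buf); // no '\0' is written, so bound by len
        }
    }

    if (tokens != small) {
        free(tokens);
    }
}
```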
@@ -401,7 +402,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
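llama_sample_repetition_penalty operates on a llama_token_data_array built from the logits. A sketch under assumptions not visible in this excerpt: llama_token_data carries a third probability field p, and llama_token_data_array is the usual { data, size, sorted } triple from llama.h of this era:

```c
#include <stdlib.h>
#include "llama.h"

// Build a candidates array from one row of logits and penalize recently seen tokens.
// The p field of llama_token_data and the layout of llama_token_data_array are assumed.
static void penalize_repeats(struct llama_context * ctx,
                             const float * logits,          // one row, n_vocab entries
                             const llama_token * last_tokens,
                             size_t last_tokens_size) {
    const int n_vocab = llama_n_vocab(ctx);

    llama_token_data * cand = malloc((size_t) n_vocab * sizeof(llama_token_data));
    for (llama_token id = 0; id < n_vocab; ++id) {
        cand[id] = (llama_token_data){ id, logits[id], 0.0f };
    }
    llama_token_data_array candidates = { cand, (size_t) n_vocab, false };

    // repetition penalty from the CTRL paper, with the negative-logit fix noted above
    llama_sample_repetition_penalty(ctx, &candidates, last_tokens, last_tokens_size, 1.1f);

    // ... further sampling calls would follow here ...
    free(cand);
}
```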
@@ -470,6 +473,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif
@@ -479,10 +486,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.8'
+  VERSION = '0.4.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1060'
 end
data/lib/llama_cpp.rb
CHANGED
@@ -101,7 +101,7 @@ module LLaMACpp
 
       embd.each { |token| output << context.token_to_str(token) }
 
-      break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+      break if !embd.empty? && embd[-1] == context.token_eos
     end
 
     output.join.delete_prefix(spaced_prompt).strip
data/sig/llama_cpp.rbs
CHANGED
@@ -1,9 +1,6 @@
 module LLaMACpp
   VERSION: String
   LLAMA_CPP_VERSION: String
-  LLAMA_FILE_VERSION: String
-  LLAMA_FILE_MAGIC: String
-  LLAMA_FILE_MAGIC_UNVERSIONED: String
   LLAMA_DEFALUT_SEED: String
 
   LLAMA_MAX_DEVICES: Integer
@@ -42,9 +39,7 @@ module LLaMACpp
                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
-  def self?.token_bos: () -> Integer
-  def self?.token_eos: () -> Integer
-  def self?.token_nl: () -> Integer
+  def self?.time_us: () -> Integer
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
@@ -81,7 +76,6 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def type: () -> String
@@ -106,6 +100,12 @@ module LLaMACpp
 
     def initialize: (model: ::LLaMACpp::Model) -> void
     def embeddings: () -> Array[Float]
+    def text: (Integer) -> String
+    def score: (Integer) -> Float
+    def type: (Integer) -> Integer
+    def token_bos: () -> Integer
+    def token_eos: () -> Integer
+    def token_nl: () -> Integer
     def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def eval_export: (String) -> bool
@@ -113,7 +113,6 @@ module LLaMACpp
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
-    def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.8
+  version: 0.4.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-
+date: 2023-08-26 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: