llama_cpp 0.9.2 → 0.9.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -301,6 +301,23 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
 
+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
 
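The four accessors share a count-then-iterate shape that is common for C APIs returning strings through caller-owned buffers. A minimal sketch of dumping every metadata pair (assuming model is a valid pointer obtained from llama_load_model_from_file(); the 256-byte buffers are an arbitrary choice for this example):

#include <stdio.h>
#include "llama.h"

// Print all GGUF metadata key/value pairs of a loaded model.
static void dump_model_meta(const struct llama_model * model) {
    char key[256];
    char val[256];
    const int n_meta = llama_model_meta_count(model);
    for (int i = 0; i < n_meta; i++) {
        // Each accessor returns the string length on success, or -1 on failure;
        // per the header contract the output buffer is always null-terminated.
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) continue;
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) continue;
        printf("%s = %s\n", key, val);
    }
}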
@@ -344,9 +361,60 @@ extern "C" {
     // KV cache
     //
 
-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);
 
     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
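The view is designed as a reusable snapshot: llama_kv_cache_view_init allocates it, llama_kv_cache_view_update refreshes it from the live cache, and llama_kv_cache_view_free releases it. A sketch of walking the view during debugging (assuming ctx is a valid llama_context pointer; the n_max_seq value of 4 is arbitrary for this example):

#include <stdio.h>
#include "llama.h"

// Print occupancy stats and the populated cells of the KV cache.
static void print_kv_cache_stats(const struct llama_context * ctx) {
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);
    llama_kv_cache_view_update(ctx, &view); // capture the current cache state

    printf("cells: %d, used: %d, tokens: %d, max contiguous free: %d\n",
           view.n_cells, view.used_cells, view.token_count, view.max_contiguous);

    for (int i = 0; i < view.n_cells; i++) {
        if (view.cells[i].pos < 0) continue; // negative pos marks an empty cell
        // cells_sequences is a flat array with n_max_seq sequence ids per cell.
        printf("cell %4d: pos %4d, first seq %d\n",
               i, view.cells[i].pos, view.cells_sequences[i * view.n_max_seq]);
    }

    llama_kv_cache_view_free(&view);
}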
@@ -517,6 +585,12 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
 
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
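These let callers ask the model whether a BOS/EOS token should be added during tokenization instead of hard-coding per-vocabulary behavior. One way the BOS variant might be used (the fallback to an SPM-vocabulary heuristic when the metadata is unknown is an assumption for illustration, not part of the API contract):

#include <stdbool.h>
#include "llama.h"

// Decide whether to prepend BOS when tokenizing a fresh prompt.
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 false, 1 true
    if (add_bos != -1) {
        return add_bos == 1;
    }
    // Metadata is unknown: fall back to a heuristic (SPM-style vocabularies
    // conventionally expect a leading BOS). This fallback is an assumption.
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
}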
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.2'
+  VERSION = '0.9.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1500'
+  LLAMA_CPP_VERSION = 'b1555'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -94,6 +94,8 @@ module LLaMACpp
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_nl: () -> Integer
+    def add_bos_token?: () -> bool
+    def add_eos_token?: () -> bool
     def token_prefix: () -> Integer
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.2
+  version: 0.9.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-11 00:00:00.000000000 Z
+date: 2023-11-25 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -32,6 +32,7 @@ files:
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-alloc.c
 - ext/llama_cpp/src/ggml-alloc.h
+- ext/llama_cpp/src/ggml-backend-impl.h
 - ext/llama_cpp/src/ggml-backend.c
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu