llama_cpp 0.9.2 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -301,6 +301,23 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

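The four accessors added in this hunk expose a loaded model's scalar GGUF metadata without reopening the GGUF file. Below is a minimal sketch of how they combine, assuming the model handle was obtained elsewhere (for example via llama_load_model_from_file); the 256-byte buffers are an arbitrary choice for this example.

#include <stdio.h>
#include "llama.h"

// Enumerate every scalar GGUF metadata key/value pair of an already-loaded model.
// Per the header comments above: the functions return -1 on failure, the output
// string is always null-terminated, and GGUF array values are not supported.
static void dump_model_metadata(const struct llama_model * model) {
    const int n_meta = llama_model_meta_count(model);
    for (int i = 0; i < n_meta; ++i) {
        char key[256];
        char val[256];
        if (llama_model_meta_key_by_index(model, i, key, sizeof(key)) < 0) {
            continue; // key could not be read
        }
        if (llama_model_meta_val_str_by_index(model, i, val, sizeof(val)) < 0) {
            continue; // value is an array or could not be converted to a string
        }
        printf("%s = %s\n", key, val);
    }
}

A single value can be fetched by name the same way, e.g. llama_model_meta_val_str(model, "general.name", buf, sizeof(buf)).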
@@ -344,9 +361,60 @@ extern "C" {
     // KV cache
     //

-    //
-
-
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
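The KV cache view added above is intended purely for inspecting cache occupancy while debugging. A rough usage sketch follows, assuming ctx is a live llama_context; the cell and sequence buffers are owned by the view, and the limit of 4 sequence ids per cell is an arbitrary choice for this example.

#include <stdio.h>
#include "llama.h"

// Print a coarse summary of KV cache occupancy (debugging only).
static void print_kv_cache_stats(const struct llama_context * ctx) {
    // Allocate a view that records at most 4 sequence ids per cell.
    struct llama_kv_cache_view view = llama_kv_cache_view_init(ctx, 4);

    // Snapshot the current state of the cache into the view.
    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d  used: %d  tokens: %d  max contiguous free: %d\n",
           view.n_cells, view.used_cells, view.token_count, view.max_contiguous);

    // Walk individual cells; pos is negative for unpopulated cells, and the
    // sequence ids of cell i start at cells_sequences[i * n_max_seq].
    for (int32_t i = 0; i < view.n_cells; ++i) {
        if (view.cells[i].pos >= 0) {
            printf("cell %d: pos %d, first seq %d\n",
                   i, view.cells[i].pos, view.cells_sequences[i * view.n_max_seq]);
        }
    }

    llama_kv_cache_view_free(&view);
}

When only the counts are needed, llama_get_kv_cache_token_count and llama_get_kv_cache_used_cells provide them directly without building a view.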
@@ -517,6 +585,12 @@ extern "C" {
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
     // codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
     LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
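llama_add_bos_token and llama_add_eos_token surface the tokenizer flags stored in the model's metadata, so callers no longer have to guess whether a BOS/EOS token should be added around a prompt. A small sketch follows; the fallback policy for the "unknown" case is this example's choice, not something the header prescribes.

#include <stdbool.h>
#include "llama.h"

// Decide whether to prepend BOS when tokenizing a prompt with this model.
static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 false, 1 true
    if (add_bos != -1) {
        return add_bos == 1;
    }
    return true; // unknown: this sketch defaults to adding BOS
}

The Ruby signatures below gain matching add_bos_token? and add_eos_token? predicates that wrap this pair.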
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.9.2'
+  VERSION = '0.9.4'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1555'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -94,6 +94,8 @@ module LLaMACpp
     def token_bos: () -> Integer
     def token_eos: () -> Integer
     def token_nl: () -> Integer
+    def add_bos_token?: () -> bool
+    def add_eos_token?: () -> bool
     def token_prefix: () -> Integer
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.9.2
+  version: 0.9.4
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-
+date: 2023-11-25 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -32,6 +32,7 @@ files:
 - ext/llama_cpp/src/LICENSE
 - ext/llama_cpp/src/ggml-alloc.c
 - ext/llama_cpp/src/ggml-alloc.h
+- ext/llama_cpp/src/ggml-backend-impl.h
 - ext/llama_cpp/src/ggml-backend.c
 - ext/llama_cpp/src/ggml-backend.h
 - ext/llama_cpp/src/ggml-cuda.cu