llama_cpp 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -10,6 +10,7 @@
 #endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>
 
 #ifdef LLAMA_SHARED
@@ -254,7 +255,11 @@ extern "C" {
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);
 
     // Get a string describing the model type
-    LLAMA_API int
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
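The three new introspection calls above are plain queries on an already loaded model. Below is a minimal C sketch of how they might be used together; the helper name print_model_info and the surrounding model-loading code are illustrative assumptions, not part of this diff.

```c
// Sketch: querying the new model introspection API. Assumes a model was
// loaded elsewhere (e.g. via llama_load_model_from_file); errors omitted.
#include <stdio.h>
#include <stdint.h>
#include "llama.h"

static void print_model_info(const struct llama_model * model) {
    char desc[128];
    // Writes a human-readable description of the model type into desc.
    llama_model_desc(model, desc, sizeof(desc));

    const uint64_t size_bytes = llama_model_size(model);     // total tensor bytes
    const uint64_t n_params   = llama_model_n_params(model); // total parameter count

    printf("model : %s\n", desc);
    printf("size  : %.2f GiB\n", (double) size_bytes / (1024.0 * 1024.0 * 1024.0));
    printf("params: %.2f B\n",   (double) n_params / 1e9);
}
```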
@@ -377,15 +382,17 @@ extern "C" {
         int n_max_tokens,
         bool add_bos);
 
-    // Token Id ->
-    //
-
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
         char * buf,
         int length);
 
-    LLAMA_API int
+    LLAMA_API int llama_token_to_piece_with_model(
         const struct llama_model * model,
         llama_token token,
         char * buf,
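Per the new comments, llama_token_to_piece does not write a null terminator, and stripping the leading whitespace of the first non-BOS piece is left to the caller. The C sketch below shows one way to print a token sequence under those rules; the buffer size and the assumption that the return value is the number of bytes copied into buf are illustrative choices, not guaranteed by this header alone.

```c
// Sketch: detokenizing with llama_token_to_piece. buf is never
// null-terminated, so output is bounded by the (assumed) byte count returned.
#include <stdio.h>
#include "llama.h"

static void print_tokens(struct llama_context * ctx, const llama_token * tokens, int n_tokens) {
    for (int i = 0; i < n_tokens; ++i) {
        char buf[64]; // assumed large enough for any single piece
        const int n = llama_token_to_piece(ctx, tokens[i], buf, (int) sizeof(buf));
        if (n > 0) {
            fwrite(buf, 1, (size_t) n, stdout); // not null-terminated
        }
    }
    printf("\n");
}
```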
@@ -465,6 +472,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
 
+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool eob;  // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool last_call;               // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
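The callback receives llama_beams_state by value, and the header stresses that the beam pointers are only valid during the synchronous callback, so any tokens to keep must be copied out immediately. A C sketch of one possible callback under those rules follows; the beam_output container, its fixed-size token buffer, and the commented invocation are assumptions for illustration, and setting eob is omitted because the stopping condition is application-specific.

```c
// Sketch: a minimal beam_search_callback that accumulates the tokens shared by
// all beams into a caller-owned buffer passed via callback_data.
#include <string.h>
#include "llama.h"

struct beam_output {
    llama_token tokens[1024]; // assumed capacity for the generated sequence
    size_t n_tokens;
};

static void beam_search_callback(void * callback_data, struct llama_beams_state beams_state) {
    struct beam_output * out = (struct beam_output *) callback_data;

    // Tokens common to all beams are shifted out before the next callback,
    // so copy them now from any beam (beams[0], per the header comment).
    if (beams_state.common_prefix_length > 0 && beams_state.n_beams > 0) {
        const struct llama_beam_view * beam = &beams_state.beam_views[0];
        size_t n   = beams_state.common_prefix_length;
        size_t cap = sizeof(out->tokens) / sizeof(out->tokens[0]);
        if (out->n_tokens + n > cap) {
            n = cap - out->n_tokens;
        }
        memcpy(out->tokens + out->n_tokens, beam->tokens, n * sizeof(llama_token));
        out->n_tokens += n;
    }
}

// Hypothetical invocation (n_past being the number of prompt tokens already evaluated):
//   struct beam_output out = {0};
//   llama_beam_search(ctx, beam_search_callback, &out,
//                     /*n_beams=*/4, /*n_past=*/n_prompt_tokens, /*n_predict=*/64, /*n_threads=*/4);
```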
@@ -477,6 +521,8 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
 
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
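llama_dump_timing_info_yaml takes any FILE * (hence the new stdio.h include at the top of the header), so timing information can be written to a log file rather than the console. A small illustrative sketch; the file name and helper are arbitrary.

```c
// Sketch: dumping a context's timing information to a YAML file.
#include <stdio.h>
#include "llama.h"

static void dump_timings(const struct llama_context * ctx) {
    FILE * f = fopen("timings.yaml", "w");
    if (f != NULL) {
        llama_dump_timing_info_yaml(f, ctx);
        fclose(f);
    }
}
```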
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.4.0'
+  VERSION = '0.5.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1140'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def
+    def desc: () -> String
+    def size: () -> Integer
+    def n_params: () -> Integer
   end
 
   class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
-    def
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-
+date: 2023-09-02 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: