llama_cpp 0.4.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -10,6 +10,7 @@
 #endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>

 #ifdef LLAMA_SHARED
@@ -163,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;

     // grammar types
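The new `only_copy` flag turns `llama_model_quantize` into a plain tensor copy. A minimal sketch of how it could be used from C, assuming the existing `llama_model_quantize_default_params()` / `llama_model_quantize()` API; the file names are placeholders:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    // Start from the library defaults, then request a copy-only pass:
    // ftype, allow_requantize and quantize_output_tensor are ignored when only_copy is set.
    struct llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true;

    // Placeholder file names; llama_model_quantize returns 0 on success.
    if (llama_model_quantize("model-in.gguf", "model-out.gguf", &params) != 0) {
        fprintf(stderr, "copy failed\n");
        return 1;
    }
    return 0;
}
```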
@@ -254,7 +256,11 @@ extern "C" {
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);

     // Get a string describing the model type
-    LLAMA_API int
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
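Together these three functions give a quick summary of a loaded model. A sketch of how they might be combined (assumes `model` was already loaded, e.g. via `llama_load_model_from_file`; the exact text produced by `llama_model_desc` is not specified here):

```c
#include <stdio.h>
#include "llama.h"

// Print a one-line summary of an already-loaded model.
void print_model_info(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));  // human-readable model type string

    printf("model : %s\n", desc);
    printf("size  : %.2f GiB\n", llama_model_size(model) / (1024.0 * 1024.0 * 1024.0));
    printf("params: %.2f B\n",   llama_model_n_params(model) / 1e9);
}
```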
@@ -377,15 +383,17 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);

-    // Token Id ->
-    //
-
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
             const struct llama_context * ctx,
             llama_token token,
             char * buf,
             int length);

-    LLAMA_API int
+    LLAMA_API int llama_token_to_piece_with_model(
             const struct llama_model * model,
             llama_token token,
             char * buf,
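The header comments spell out the contract: the piece is not null-terminated and the caller trims the leading space of the first non-BOS piece. A sketch of a typical calling pattern, assuming the common llama.cpp convention that a negative return value is the negated buffer size required; the helper name is illustrative:

```c
#include <stdlib.h>
#include "llama.h"

// Copy the text of one token into a freshly allocated, null-terminated string.
// Returns NULL on failure; the caller frees the result.
char * token_to_piece_dup(const struct llama_context * ctx, llama_token token) {
    int cap = 8;
    char * buf = malloc(cap + 1);
    if (!buf) return NULL;

    int n = llama_token_to_piece(ctx, token, buf, cap);
    if (n < 0) {
        // Assumed convention: -n is the size the buffer actually needs.
        cap = -n;
        char * grown = realloc(buf, cap + 1);
        if (!grown) { free(buf); return NULL; }
        buf = grown;
        n = llama_token_to_piece(ctx, token, buf, cap);
    }
    if (n < 0) { free(buf); return NULL; }

    buf[n] = '\0';  // the API never writes a terminator, so add one here
    return buf;
}
```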
@@ -402,6 +410,8 @@ extern "C" {

     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
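`llama_grammar_copy` makes it possible to give each decoding branch its own grammar state rather than sharing one. A small sketch under that assumption (the helper name is illustrative):

```c
#include "llama.h"

// Fork the grammar so a second decoding branch can accept tokens
// without advancing the original state. Caller frees the fork.
struct llama_grammar * fork_grammar_branch(struct llama_context * ctx,
                                           struct llama_grammar * base,
                                           llama_token accepted_on_fork) {
    struct llama_grammar * fork = llama_grammar_copy(base);
    llama_grammar_accept_token(ctx, fork, accepted_on_fork);  // only the fork advances
    return fork;  // release later with llama_grammar_free(fork)
}
```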
@@ -465,6 +475,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;    // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob;  // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;               // Number of elements in beam_views[].
+        size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
+        bool   last_call;             // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
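The callback contract is stated in the comments above: whenever `common_prefix_length` is non-zero, those tokens must be copied out during the callback, because they are shifted off every beam before the next call. A sketch of such a callback (the collector struct and the numeric arguments in the usage comment are illustrative):

```c
#include <string.h>
#include "llama.h"

// Illustrative accumulator for tokens that all beams agree on.
struct beam_collector {
    llama_token tokens[1024];
    size_t      n_tokens;
};

// Matches llama_beam_search_callback_fn_t.
static void collect_beam_tokens(void * callback_data, struct llama_beams_state state) {
    struct beam_collector * out = callback_data;

    if (state.common_prefix_length > 0) {
        // The shared prefix is identical across beams, so beam_views[0] is as good as any.
        size_t n = state.common_prefix_length;
        if (out->n_tokens + n > 1024) {
            n = 1024 - out->n_tokens;
        }
        memcpy(out->tokens + out->n_tokens, state.beam_views[0].tokens, n * sizeof(llama_token));
        out->n_tokens += n;
    }

    // A fuller callback would also set beam_views[i].eob when a beam reaches
    // an end-of-beam condition (e.g. it produced an EOS token).
    (void) state.last_call;  // true on the final invocation
}

// Usage sketch, assuming `ctx` already holds an evaluated prompt of n_past tokens:
//   struct beam_collector out = {0};
//   llama_beam_search(ctx, collect_beam_tokens, &out, /*n_beams=*/4, n_past, /*n_predict=*/64, /*n_threads=*/4);
```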
@@ -477,6 +524,8 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
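This declaration is why `<stdio.h>` is now included at the top of the header: it takes a `FILE *`. A small sketch of dumping the timings to a log file:

```c
#include <stdio.h>
#include "llama.h"

// Append the context's timing information as YAML to a log file.
void dump_timings_to_file(const struct llama_context * ctx, const char * path) {
    FILE * f = fopen(path, "a");
    if (!f) {
        return;
    }
    llama_dump_timing_info_yaml(f, ctx);
    fclose(f);
}
```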
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.4.0'
+  VERSION = '0.5.1'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b1198'
 end
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def
+    def desc: () -> String
+    def size: () -> Integer
+    def n_params: () -> Integer
   end

   class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
-    def
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: