llama_cpp 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/llama.h
CHANGED
@@ -10,6 +10,7 @@
|
|
10
10
|
#endif // GGML_USE_CUBLAS
|
11
11
|
#include <stddef.h>
|
12
12
|
#include <stdint.h>
|
13
|
+
#include <stdio.h>
|
13
14
|
#include <stdbool.h>
|
14
15
|
|
15
16
|
#ifdef LLAMA_SHARED
|
@@ -163,6 +164,7 @@ extern "C" {
|
|
163
164
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
164
165
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
165
166
|
bool quantize_output_tensor; // quantize output.weight
|
167
|
+
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
166
168
|
} llama_model_quantize_params;
|
167
169
|
|
168
170
|
// grammar types
|
@@ -254,7 +256,11 @@ extern "C" {
|
|
254
256
|
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
255
257
|
|
256
258
|
// Get a string describing the model type
|
257
|
-
LLAMA_API int
|
259
|
+
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
260
|
+
// Returns the total size of all the tensors in the model in bytes
|
261
|
+
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
262
|
+
// Returns the total number of parameters in the model
|
263
|
+
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
258
264
|
|
259
265
|
// Returns 0 on success
|
260
266
|
LLAMA_API int llama_model_quantize(
|
@@ -377,15 +383,17 @@ extern "C" {
|
|
377
383
|
int n_max_tokens,
|
378
384
|
bool add_bos);
|
379
385
|
|
380
|
-
// Token Id ->
|
381
|
-
//
|
382
|
-
|
386
|
+
// Token Id -> Piece.
|
387
|
+
// Uses the vocabulary in the provided context.
|
388
|
+
// Does not write null terminator to the buffer.
|
389
|
+
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
|
390
|
+
LLAMA_API int llama_token_to_piece(
|
383
391
|
const struct llama_context * ctx,
|
384
392
|
llama_token token,
|
385
393
|
char * buf,
|
386
394
|
int length);
|
387
395
|
|
388
|
-
LLAMA_API int
|
396
|
+
LLAMA_API int llama_token_to_piece_with_model(
|
389
397
|
const struct llama_model * model,
|
390
398
|
llama_token token,
|
391
399
|
char * buf,
|
@@ -402,6 +410,8 @@ extern "C" {
|
|
402
410
|
|
403
411
|
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
404
412
|
|
413
|
+
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
|
414
|
+
|
405
415
|
//
|
406
416
|
// Sampling functions
|
407
417
|
//
|
@@ -465,6 +475,43 @@ extern "C" {
|
|
465
475
|
/// @details Accepts the sampled token into the grammar
|
466
476
|
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
467
477
|
|
478
|
+
//
|
479
|
+
// Beam search
|
480
|
+
//
|
481
|
+
|
482
|
+
struct llama_beam_view {
|
483
|
+
const llama_token * tokens;
|
484
|
+
size_t n_tokens;
|
485
|
+
float p; // Cumulative beam probability (renormalized relative to all beams)
|
486
|
+
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
487
|
+
};
|
488
|
+
|
489
|
+
// Passed to beam_search_callback function.
|
490
|
+
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
|
491
|
+
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
|
492
|
+
// These pointers are valid only during the synchronous callback, so should not be saved.
|
493
|
+
struct llama_beams_state {
|
494
|
+
struct llama_beam_view * beam_views;
|
495
|
+
size_t n_beams; // Number of elements in beam_views[].
|
496
|
+
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
497
|
+
bool last_call; // True iff this is the last callback invocation.
|
498
|
+
};
|
499
|
+
|
500
|
+
// Type of pointer to the beam_search_callback function.
|
501
|
+
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
|
502
|
+
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
|
503
|
+
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
|
504
|
+
|
505
|
+
/// @details Deterministically returns entire sentence constructed by a beam search.
|
506
|
+
/// @param ctx Pointer to the llama_context.
|
507
|
+
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
|
508
|
+
/// @param callback_data A pointer that is simply passed back to callback.
|
509
|
+
/// @param n_beams Number of beams to use.
|
510
|
+
/// @param n_past Number of tokens already evaluated.
|
511
|
+
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
512
|
+
/// @param n_threads Number of threads as passed to llama_eval().
|
513
|
+
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
|
514
|
+
|
468
515
|
// Performance information
|
469
516
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
470
517
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
@@ -477,6 +524,8 @@ extern "C" {
|
|
477
524
|
// If this is not called, or NULL is supplied, everything is output on stderr.
|
478
525
|
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
479
526
|
|
527
|
+
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
528
|
+
|
480
529
|
#ifdef __cplusplus
|
481
530
|
}
|
482
531
|
#endif
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.4.0'
|
6
|
+
VERSION = '0.5.1'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = '
|
9
|
+
LLAMA_CPP_VERSION = 'b1198'
|
10
10
|
end
|
data/lib/llama_cpp.rb
CHANGED
data/sig/llama_cpp.rbs
CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
|
|
76
76
|
def n_vocab: () -> Integer
|
77
77
|
def n_ctx: () -> Integer
|
78
78
|
def n_embd: () -> Integer
|
79
|
-
def
|
79
|
+
def token_to_piece: (Integer) -> String
|
80
80
|
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
81
|
-
def
|
81
|
+
def desc: () -> String
|
82
|
+
def size: () -> Integer
|
83
|
+
def n_params: () -> Integer
|
82
84
|
end
|
83
85
|
|
84
86
|
class Timings
|
@@ -116,7 +118,7 @@ module LLaMACpp
|
|
116
118
|
def timings: () -> ::LLaMACpp::Timings
|
117
119
|
def print_timings: () -> void
|
118
120
|
def reset_timings: () -> void
|
119
|
-
def
|
121
|
+
def token_to_piece: (Integer) -> String
|
120
122
|
def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
|
121
123
|
def kv_cache_token_count: () -> Integer
|
122
124
|
def set_rng_seed: (Integer) -> void
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llama_cpp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshoku
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08
|
11
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
14
14
|
email:
|