llama_cpp 0.4.0 → 0.5.1

@@ -10,6 +10,7 @@
  #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
+ #include <stdio.h>
  #include <stdbool.h>

  #ifdef LLAMA_SHARED
@@ -163,6 +164,7 @@ extern "C" {
  enum llama_ftype ftype; // quantize to this llama_ftype
  bool allow_requantize; // allow quantizing non-f32/f16 tensors
  bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
  } llama_model_quantize_params;

  // grammar types
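A minimal sketch (not part of the diff) of how the new only_copy flag could be used to rewrite a model file without requantizing it; llama_model_quantize_default_params() and the file names are assumptions not shown in this diff:

// copy tensors as-is: ftype, allow_requantize and quantize_output_tensor are ignored
llama_model_quantize_params qparams = llama_model_quantize_default_params();
qparams.only_copy = true;
if (llama_model_quantize("model-in.gguf", "model-out.gguf", &qparams) != 0) {
    fprintf(stderr, "model copy failed\n");
}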
@@ -254,7 +256,11 @@ extern "C" {
  LLAMA_API int llama_model_n_embd (const struct llama_model * model);

  // Get a string describing the model type
- LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+ LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+ // Returns the total size of all the tensors in the model in bytes
+ LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+ // Returns the total number of parameters in the model
+ LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
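A hedged usage sketch (not from the diff) of the renamed llama_model_desc() and the new size/parameter getters; obtaining the struct llama_model * (e.g. via llama_load_model_from_file()) and the format strings are assumptions:

char desc[128];
llama_model_desc(model, desc, sizeof(desc));            // short human-readable model description
const uint64_t n_bytes  = llama_model_size(model);      // total tensor size in bytes
const uint64_t n_params = llama_model_n_params(model);  // total parameter count
printf("%s | %.2f GiB | %.2fB params\n",
       desc, n_bytes / (1024.0 * 1024.0 * 1024.0), n_params / 1e9);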
@@ -377,15 +383,17 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

- // Token Id -> String. Uses the vocabulary in the provided context
- // Does not write null terminator to the buffer
- LLAMA_API int llama_token_to_str(
+ // Token Id -> Piece.
+ // Uses the vocabulary in the provided context.
+ // Does not write null terminator to the buffer.
+ // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+ LLAMA_API int llama_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  char * buf,
  int length);

- LLAMA_API int llama_token_to_str_with_model(
+ LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
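A minimal sketch (not part of the diff) of decoding with the renamed llama_token_to_piece(); treating the return value as the number of bytes written is my assumption, grounded only in the "does not write null terminator" comment above:

char piece[64];
const int n = llama_token_to_piece(ctx, token, piece, (int) sizeof(piece));
if (n >= 0) {
    fwrite(piece, 1, (size_t) n, stdout); // buf is not null-terminated
}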
@@ -402,6 +410,8 @@ extern "C" {

  LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

+ LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
  //
  // Sampling functions
  //
@@ -465,6 +475,43 @@ extern "C" {
  /// @details Accepts the sampled token into the grammar
  LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+ //
+ // Beam search
+ //
+
+ struct llama_beam_view {
+ const llama_token * tokens;
+ size_t n_tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ };
+
+ // Passed to beam_search_callback function.
+ // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+ // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+ // These pointers are valid only during the synchronous callback, so should not be saved.
+ struct llama_beams_state {
+ struct llama_beam_view * beam_views;
+ size_t n_beams; // Number of elements in beam_views[].
+ size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+ bool last_call; // True iff this is the last callback invocation.
+ };
+
+ // Type of pointer to the beam_search_callback function.
+ // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+ // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+ typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+ /// @details Deterministically returns entire sentence constructed by a beam search.
+ /// @param ctx Pointer to the llama_context.
+ /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+ /// @param callback_data A pointer that is simply passed back to callback.
+ /// @param n_beams Number of beams to use.
+ /// @param n_past Number of tokens already evaluated.
+ /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+ /// @param n_threads Number of threads as passed to llama_eval().
+ LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
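A hedged sketch (not part of the diff) of a beam-search callback built from the declarations above: it marks a beam as ended when its newest token is EOS. The llama_token_eos(ctx) helper and the surrounding setup (ctx, n_past, thread count) are assumptions:

static void beam_search_cb(void * callback_data, struct llama_beams_state state) {
    struct llama_context * ctx = (struct llama_context *) callback_data;
    for (size_t i = 0; i < state.n_beams; ++i) {
        struct llama_beam_view * bv = &state.beam_views[i];
        // set end-of-beam when the last token of this beam is EOS
        if (!bv->eob && bv->n_tokens > 0 && bv->tokens[bv->n_tokens - 1] == llama_token_eos(ctx)) {
            bv->eob = true;
        }
    }
}

// e.g.: llama_beam_search(ctx, beam_search_cb, ctx, /*n_beams*/ 4, n_past, /*n_predict*/ 64, /*n_threads*/ 4);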
@@ -477,6 +524,8 @@ extern "C" {
  // If this is not called, or NULL is supplied, everything is output on stderr.
  LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

+ LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
  #ifdef __cplusplus
  }
  #endif
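The new #include <stdio.h> at the top of the header presumably supports the FILE * parameter of llama_dump_timing_info_yaml. A one-line usage sketch (not part of the diff), assuming an initialized llama_context * ctx:

llama_dump_timing_info_yaml(stderr, ctx);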
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.4.0'
+ VERSION = '0.5.1'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1060'
+ LLAMA_CPP_VERSION = 'b1198'
  end
data/lib/llama_cpp.rb CHANGED
@@ -99,7 +99,7 @@ module LLaMACpp
  end
  end

- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }

  break if !embd.empty? && embd[-1] == context.token_eos
  end
data/sig/llama_cpp.rbs CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
- def type: () -> String
+ def desc: () -> String
+ def size: () -> Integer
+ def n_params: () -> Integer
  end

  class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def kv_cache_token_count: () -> Integer
  def set_rng_seed: (Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.4.0
+ version: 0.5.1
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-08-26 00:00:00.000000000 Z
+ date: 2023-09-08 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: