llama_cpp 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@
 #endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdbool.h>

 #ifdef LLAMA_SHARED
@@ -163,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;

     // grammar types
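The new only_copy flag turns llama_model_quantize into a pure copy/convert pass. A minimal C sketch of how it could be used; the llama_model_quantize_default_params() helper and the (fname_inp, fname_out, params) form of llama_model_quantize are assumed from the surrounding llama.cpp API and are not part of this diff:

```c
#include "llama.h"

// Copy a model file without requantizing it; ftype, allow_requantize and
// quantize_output_tensor are ignored when only_copy is set.
int copy_model(const char * fname_inp, const char * fname_out) {
    struct llama_model_quantize_params params = llama_model_quantize_default_params();
    params.only_copy = true;
    return llama_model_quantize(fname_inp, fname_out, &params); // returns 0 on success
}
```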
@@ -254,7 +256,11 @@ extern "C" {
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);

     // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
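llama_model_type is renamed to llama_model_desc, and two introspection calls are added. A short sketch of calling them on an already-loaded model (how the model pointer is obtained is out of scope here):

```c
#include <stdio.h>
#include "llama.h"

// Print a human-readable description plus the new size/parameter counters
// for a model that has already been loaded elsewhere.
static void print_model_info(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));  // formerly llama_model_type()
    printf("model:  %s\n", desc);
    printf("size:   %.2f GiB\n", llama_model_size(model) / (1024.0 * 1024.0 * 1024.0));
    printf("params: %.2f B\n", llama_model_n_params(model) / 1e9);
}
```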
@@ -377,15 +383,17 @@ extern "C" {
         int n_max_tokens,
         bool add_bos);

-    // Token Id -> String. Uses the vocabulary in the provided context
-    // Does not write null terminator to the buffer
-    LLAMA_API int llama_token_to_str(
+    // Token Id -> Piece.
+    // Uses the vocabulary in the provided context.
+    // Does not write null terminator to the buffer.
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    LLAMA_API int llama_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
         char * buf,
         int length);

-    LLAMA_API int llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_piece_with_model(
         const struct llama_model * model,
         llama_token token,
         char * buf,
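llama_token_to_str becomes llama_token_to_piece (and likewise for the _with_model variant). A hedged sketch of a wrapper that adds the null terminator the API does not write, assuming the usual llama.cpp convention that a negative return value means the buffer was too small:

```c
#include "llama.h"

// Convert a single token to its text piece and null-terminate it.
// Returns the piece length, or -1 if out_size is too small.
static int token_to_piece_cstr(const struct llama_context * ctx, llama_token token,
                               char * out, int out_size) {
    if (out_size <= 0) {
        return -1;
    }
    const int n = llama_token_to_piece(ctx, token, out, out_size - 1); // leave room for '\0'
    if (n < 0) {
        return -1; // assumed convention: negative return means the buffer was too small
    }
    out[n] = '\0'; // llama_token_to_piece itself writes no terminator
    return n;
}
```

When decoding a whole sequence, the caller is also expected to strip the leading whitespace of the first non-BOS piece, as the new comment notes.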
@@ -402,6 +410,8 @@ extern "C" {

     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);

+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
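llama_grammar_copy duplicates a grammar together with its parse state, which is useful when several decoding branches (for example the beams introduced below) must advance the same grammar independently. A sketch using only calls from this header:

```c
#include "llama.h"

// Advance two decoding branches against the same grammar independently:
// the copy keeps its own parse state, so accepting a token on one branch
// does not affect the other.
static void fork_grammar(struct llama_context * ctx, struct llama_grammar * grammar,
                         llama_token token_a, llama_token token_b) {
    struct llama_grammar * branch = llama_grammar_copy(grammar);
    llama_grammar_accept_token(ctx, grammar, token_a); // branch A: original grammar
    llama_grammar_accept_token(ctx, branch,  token_b); // branch B: the copy
    llama_grammar_free(branch);
}
```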
@@ -465,6 +475,43 @@ extern "C" {
     /// @details Accepts the sampled token into the grammar
     LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+    //
+    // Beam search
+    //
+
+    struct llama_beam_view {
+        const llama_token * tokens;
+        size_t n_tokens;
+        float  p;   // Cumulative beam probability (renormalized relative to all beams)
+        bool   eob; // Callback should set this to true when a beam is at end-of-beam.
+    };
+
+    // Passed to beam_search_callback function.
+    // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+    // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+    // These pointers are valid only during the synchronous callback, so should not be saved.
+    struct llama_beams_state {
+        struct llama_beam_view * beam_views;
+        size_t n_beams;              // Number of elements in beam_views[].
+        size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+        bool   last_call;            // True iff this is the last callback invocation.
+    };
+
+    // Type of pointer to the beam_search_callback function.
+    // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+    // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+    typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+    /// @details Deterministically returns entire sentence constructed by a beam search.
+    /// @param ctx Pointer to the llama_context.
+    /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+    /// @param callback_data A pointer that is simply passed back to callback.
+    /// @param n_beams Number of beams to use.
+    /// @param n_past Number of tokens already evaluated.
+    /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+    /// @param n_threads Number of threads as passed to llama_eval().
+    LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
     // Performance information
     LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
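The new beam search API is callback-driven: each iteration hands the caller a snapshot of all beams, and the tokens shared by every beam (common_prefix_length of them) must be copied out before they are shifted away. A minimal sketch of a callback and its invocation; the collector struct and the 4/64 parameter values are arbitrary examples, not part of the API:

```c
#include "llama.h"

// Example callback_data: a fixed-size buffer that accumulates the tokens
// every beam agrees on so far.
struct beam_collector {
    llama_token tokens[1024];
    size_t n;
};

static void collect_beams(void * callback_data, struct llama_beams_state beams_state) {
    struct beam_collector * out = (struct beam_collector *) callback_data;
    // Tokens in the common prefix are final; copy them from any beam (here beams[0])
    // because they will be removed from all beams before the next callback.
    for (size_t i = 0; i < beams_state.common_prefix_length && out->n < 1024; ++i) {
        out->tokens[out->n++] = beams_state.beam_views[0].tokens[i];
    }
    // A real callback would also set beam_views[j].eob = true for beams that have
    // reached an end-of-sequence condition (that check is application-specific).
}

// Usage, assuming ctx already holds an evaluated prompt of n_past tokens:
//   struct beam_collector collected = {0};
//   llama_beam_search(ctx, collect_beams, &collected,
//                     /*n_beams=*/4, n_past, /*n_predict=*/64, /*n_threads=*/4);
```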
@@ -477,6 +524,8 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif
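llama_dump_timing_info_yaml writes the context's timing counters to a caller-supplied FILE * in YAML form, which is what the new stdio.h include at the top of the header is for. A small sketch; the file name is an arbitrary example:

```c
#include <stdio.h>
#include "llama.h"

// Append this run's timing info to a YAML log file.
static void log_timings(const struct llama_context * ctx) {
    FILE * f = fopen("timings.yaml", "a"); // example path
    if (f != NULL) {
        llama_dump_timing_info_yaml(f, ctx);
        fclose(f);
    }
}
```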
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.4.0'
+  VERSION = '0.5.1'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1060'
+  LLAMA_CPP_VERSION = 'b1198'
 end
data/lib/llama_cpp.rb CHANGED
@@ -99,7 +99,7 @@ module LLaMACpp
       end
     end

-    embd.each { |token| output << context.token_to_str(token) }
+    embd.each { |token| output << context.token_to_piece(token) }

     break if !embd.empty? && embd[-1] == context.token_eos
   end
data/sig/llama_cpp.rbs CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
-    def token_to_str: (Integer) -> String
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def type: () -> String
+    def desc: () -> String
+    def size: () -> Integer
+    def n_params: () -> Integer
   end

   class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
     def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
-    def token_to_str: (Integer) -> String
+    def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-08-26 00:00:00.000000000 Z
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: