llama_cpp 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@
  #endif // GGML_USE_CUBLAS
  #include <stddef.h>
  #include <stdint.h>
+ #include <stdio.h>
  #include <stdbool.h>

  #ifdef LLAMA_SHARED
@@ -254,7 +255,11 @@ extern "C" {
  LLAMA_API int llama_model_n_embd (const struct llama_model * model);

  // Get a string describing the model type
- LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+ LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+ // Returns the total size of all the tensors in the model in bytes
+ LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+ // Returns the total number of parameters in the model
+ LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

  // Returns 0 on success
  LLAMA_API int llama_model_quantize(
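
The renamed llama_model_desc together with the new llama_model_size and llama_model_n_params calls makes it possible to report basic model statistics straight from the C API. A minimal C sketch, assuming a model already loaded via llama_load_model_from_file; the print_model_info helper is illustrative, not part of the API:

    #include <stdio.h>
    #include "llama.h"

    // Illustrative helper (not part of the API): print basic statistics for a loaded model.
    static void print_model_info(const struct llama_model * model) {
        char desc[128];
        llama_model_desc(model, desc, sizeof(desc));               // human-readable model description
        unsigned long long size   = llama_model_size(model);       // total tensor size in bytes
        unsigned long long params = llama_model_n_params(model);   // total parameter count
        printf("%s: %.2f GiB, %llu parameters\n",
               desc, size / (1024.0 * 1024.0 * 1024.0), params);
    }
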
@@ -377,15 +382,17 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

- // Token Id -> String. Uses the vocabulary in the provided context
- // Does not write null terminator to the buffer
- LLAMA_API int llama_token_to_str(
+ // Token Id -> Piece.
+ // Uses the vocabulary in the provided context.
+ // Does not write null terminator to the buffer.
+ // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+ LLAMA_API int llama_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  char * buf,
  int length);

- LLAMA_API int llama_token_to_str_with_model(
+ LLAMA_API int llama_token_to_piece_with_model(
  const struct llama_model * model,
  llama_token token,
  char * buf,
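
Because llama_token_to_piece does not null-terminate the buffer and leaves the leading whitespace of the first non-BOS token to the caller, decoding a token sequence takes a little care. A minimal C sketch, assuming the token array excludes BOS and treating a negative return value as "piece did not fit" (that convention is an assumption here, not stated in the header); the print_tokens helper is illustrative:

    #include <stdio.h>
    #include "llama.h"

    // Illustrative helper: write the text for a token sequence to stdout.
    static void print_tokens(struct llama_context * ctx,
                             const llama_token * tokens, int n_tokens) {
        for (int i = 0; i < n_tokens; ++i) {
            char buf[64];
            int n = llama_token_to_piece(ctx, tokens[i], buf, (int) sizeof(buf));
            if (n < 0) continue;                      // assumed: buffer too small, skip in this sketch
            const char * piece = buf;
            // Per the header comment, drop the leading space of the first token when
            // decoding a multi-token sequence (tokens here are assumed to exclude BOS).
            if (i == 0 && n > 0 && piece[0] == ' ') { ++piece; --n; }
            fwrite(piece, 1, (size_t) n, stdout);     // buf is not null-terminated
        }
    }
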
@@ -465,6 +472,43 @@ extern "C" {
  /// @details Accepts the sampled token into the grammar
  LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);

+ //
+ // Beam search
+ //
+
+ struct llama_beam_view {
+ const llama_token * tokens;
+ size_t n_tokens;
+ float p; // Cumulative beam probability (renormalized relative to all beams)
+ bool eob; // Callback should set this to true when a beam is at end-of-beam.
+ };
+
+ // Passed to beam_search_callback function.
+ // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
+ // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
+ // These pointers are valid only during the synchronous callback, so should not be saved.
+ struct llama_beams_state {
+ struct llama_beam_view * beam_views;
+ size_t n_beams; // Number of elements in beam_views[].
+ size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
+ bool last_call; // True iff this is the last callback invocation.
+ };
+
+ // Type of pointer to the beam_search_callback function.
+ // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
+ // passed back to beam_search_callback. This avoids having to use global variables in the callback.
+ typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
+
+ /// @details Deterministically returns entire sentence constructed by a beam search.
+ /// @param ctx Pointer to the llama_context.
+ /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
+ /// @param callback_data A pointer that is simply passed back to callback.
+ /// @param n_beams Number of beams to use.
+ /// @param n_past Number of tokens already evaluated.
+ /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
+ /// @param n_threads Number of threads as passed to llama_eval().
+ LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
+
  // Performance information
  LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
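
The new beam-search interface is callback driven: on each iteration the callback sees the current beams, may flag beams as ended via eob, and should copy out the tokens that have become common to all beams before they are shifted away. A minimal C sketch of such a callback, assuming ctx and n_past come from a prior evaluation; the beam_collector struct and collect_beams helper are illustrative, not part of the API:

    #include <stdio.h>
    #include <string.h>
    #include "llama.h"

    // Illustrative collector for the tokens that all beams agree on.
    struct beam_collector {
        llama_token tokens[1024];
        size_t n_tokens;
        llama_token eos;   // end-of-sentence id, e.g. obtained from llama_token_eos(ctx)
    };

    static void collect_beams(void * callback_data, struct llama_beams_state beams_state) {
        struct beam_collector * c = (struct beam_collector *) callback_data;

        // Mark beams that reached EOS as ended, as the header asks the callback to do.
        for (size_t i = 0; i < beams_state.n_beams; ++i) {
            struct llama_beam_view * v = &beams_state.beam_views[i];
            if (!v->eob && v->n_tokens > 0 && v->tokens[v->n_tokens - 1] == c->eos) {
                v->eob = true;
            }
        }

        // Copy the prefix now common to all beams; it is shifted out before the next callback.
        size_t n = beams_state.common_prefix_length;
        if (n > 0 && c->n_tokens + n <= sizeof(c->tokens) / sizeof(c->tokens[0])) {
            memcpy(c->tokens + c->n_tokens, beams_state.beam_views[0].tokens, n * sizeof(llama_token));
            c->n_tokens += n;
        }
    }

    // Usage sketch (assumes ctx, n_past, and a thread count):
    //   struct beam_collector c = { .n_tokens = 0, .eos = llama_token_eos(ctx) };
    //   llama_beam_search(ctx, collect_beams, &c, /*n_beams=*/4, n_past, /*n_predict=*/64, /*n_threads=*/4);
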
@@ -477,6 +521,8 @@ extern "C" {
  // If this is not called, or NULL is supplied, everything is output on stderr.
  LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);

+ LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+
  #ifdef __cplusplus
  }
  #endif
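
llama_dump_timing_info_yaml is also why <stdio.h> is now included at the top of the header: it writes to a FILE * stream. A minimal C sketch that dumps a context's timings to a file; the dump_timings helper name is ours:

    #include <stdio.h>
    #include "llama.h"

    // Illustrative helper: dump a context's timing information as YAML to a file.
    static void dump_timings(const struct llama_context * ctx, const char * path) {
        FILE * f = fopen(path, "w");
        if (f == NULL) {
            return;   // could not open the output file
        }
        llama_dump_timing_info_yaml(f, ctx);
        fclose(f);
    }
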
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.4.0'
+ VERSION = '0.5.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b1060'
+ LLAMA_CPP_VERSION = 'b1140'
  end
data/lib/llama_cpp.rb CHANGED
@@ -99,7 +99,7 @@ module LLaMACpp
  end
  end

- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }

  break if !embd.empty? && embd[-1] == context.token_eos
  end
data/sig/llama_cpp.rbs CHANGED
@@ -76,9 +76,11 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
- def type: () -> String
+ def desc: () -> String
+ def size: () -> Integer
+ def n_params: () -> Integer
  end

  class Timings
@@ -116,7 +118,7 @@ module LLaMACpp
  def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
- def token_to_str: (Integer) -> String
+ def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def kv_cache_token_count: () -> Integer
  def set_rng_seed: (Integer) -> void
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.4.0
+ version: 0.5.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-08-26 00:00:00.000000000 Z
+ date: 2023-09-02 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: