llama_cpp 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,6 +46,8 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
+ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
 typedef void (*llama_progress_callback)(float progress, void *ctx);
 
 struct llama_context_params {
- int seed; // RNG seed, -1 for random
- int n_ctx; // text context
- int n_batch; // prompt processing batch size
- int n_gpu_layers; // number of layers to store in VRAM
- int main_gpu; // the GPU that is used for scratch and small tensors
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
 float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
 // called with a progress value between 0 and 1, pass NULL to disable
 llama_progress_callback progress_callback;
@@ -132,6 +134,20 @@ extern "C" {
 bool quantize_output_tensor; // quantize output.weight
 } llama_model_quantize_params;
 
+ // performance timing information
+ struct llama_timings {
+ double t_start_ms;
+ double t_end_ms;
+ double t_load_ms;
+ double t_sample_ms;
+ double t_p_eval_ms;
+ double t_eval_ms;
+
+ int32_t n_sample;
+ int32_t n_p_eval;
+ int32_t n_eval;
+ };
+
 LLAMA_API struct llama_context_params llama_context_default_params();
 LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -196,7 +212,7 @@ extern "C" {
 LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
 // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
 
 // Returns the maximum size in bytes of the state (rng, logits, embedding
 // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +242,14 @@ extern "C" {
 int n_past,
 int n_threads);
 
+ // Same as llama_eval, but use float matrix input directly.
+ LLAMA_API int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads);
+
 // Export a static computation graph for context of 511 and batch size of 1
 // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
 // parameters here to keep things simple
@@ -321,6 +345,7 @@ extern "C" {
 LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
 // Performance information
+ LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
 LLAMA_API void llama_print_timings(struct llama_context * ctx);
 LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
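At the C level, 0.3.2 widens the RNG seed to uint32_t and introduces LLAMA_DEFAULT_SEED (0xFFFFFFFF) as the "pick a random seed" sentinel, alongside the new llama_timings struct, llama_get_timings, and llama_eval_embd. A minimal sketch of how the seed change might look from the Ruby side, assuming LLaMACpp::ContextParams.new and its seed accessor behave as in the gem's README examples:

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42            # fixed seed for reproducible sampling
# params.seed = 0xFFFFFFFF  # LLAMA_DEFAULT_SEED: ask llama.cpp to draw a random seed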
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
 # The version of llama_cpp.rb you install.
- VERSION = '0.3.0'
+ VERSION = '0.3.2'
 
 # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-9d23589'
+ LLAMA_CPP_VERSION = 'master-481f793'
 end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
 # @param prompt [String] The prompt to start generation with.
 # @param n_predict [Integer] The number of tokens to predict.
 # @param n_threads [Integer] The number of threads.
+ # @param n_keep [Integer] The number of tokens to keep in the context.
+ # @param n_batch [Integer] The number of tokens to process in a batch.
+ # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+ # @param repeat_penalty [Float] The repetition penalty.
+ # @param frequency [Float] The frequency penalty.
+ # @param presence [Float] The presence penalty.
+ # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+ # @param top_p [Float] The probability threshold for nucleus sampling.
+ # @param tfs_z [Float] The z parameter for tail-free sampling.
+ # @param typical_p [Float] The probability for typical sampling.
+ # @param temperature [Float] The temperature for temperature sampling.
 # @return [String]
- def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+ repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+ top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
 raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
 raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
 embd = []
 n_consumed = 0
- n_keep = 10
 n_past = 0
 n_remain = n_predict
- repeat_last_n = 64
- repeat_penalty = 1.1
- frequency = 0.0
- presence = 0.0
- top_k = 40
- top_p = 0.95
- tfs_z = 1.0
- typical_p = 1.0
- temperature = 0.8
- n_batch = 512
 n_vocab = context.n_vocab
 output = []
 
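The sampling parameters that 0.3.0 hard-coded inside generate are now keyword arguments with the same defaults, so callers can tune them per call. A hedged usage sketch; the model path is a placeholder and Model.new(model_path:, params:) is assumed to match the gem's README examples, while the keyword names and defaults come from the new signature above:

require 'llama_cpp'

params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: '/path/to/quantized-model.bin', params: params)
context = LLaMACpp::Context.new(model: model)

# Tune sampling per call instead of relying on the old hard-coded values.
puts LLaMACpp.generate(context, 'Hello, my name is',
                       n_predict: 64, n_threads: 4,
                       top_k: 40, top_p: 0.95, temperature: 0.8,
                       repeat_last_n: 64, repeat_penalty: 1.1)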
data/sig/llama_cpp.rbs CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
 LLAMA_FILE_VERSION: String
 LLAMA_FILE_MAGIC: String
 LLAMA_FILE_MAGIC_UNVERSIONED: String
+ LLAMA_DEFALUT_SEED: String
 
 LLAMA_MAX_DEVICES: Integer
 
@@ -27,7 +28,10 @@ module LLaMACpp
 
 def self?.init_backend: (?numa: bool) -> void
 def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String,
+ ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+ ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
 def self?.print_system_info: () -> void
 def self?.token_bos: () -> Integer
 def self?.token_eos: () -> Integer
@@ -66,18 +70,34 @@ module LLaMACpp
 def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
 end
 
+ class Timings
+ public
+
+ def t_start_ms: () -> Float
+ def t_end_ms: () -> Float
+ def t_load_ms: () -> Float
+ def t_sample_ms: () -> Float
+ def t_p_eval_ms: () -> Float
+ def t_eval_ms: () -> Float
+ def n_sample: () -> Integer
+ def n_p_eval: () -> Integer
+ def n_eval: () -> Integer
+ end
+
 class Context
 public
 
 def initialize: (model: ::LLaMACpp::Model) -> void
 def embeddings: () -> Array[Float]
 def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
 def eval_export: (String) -> bool
 def logits: () -> Array[Float]
 def n_ctx: () -> Integer
 def n_embd: () -> Integer
 def n_vocab: () -> Integer
 def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def timings: () -> ::LLaMACpp::Timings
 def print_timings: () -> void
 def reset_timings: () -> void
 def token_to_str: (Integer) -> String
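The new Timings class, Context#timings, and Context#eval_embd mirror the C additions above (llama_timings, llama_get_timings, llama_eval_embd). A short sketch of reading timings, continuing from the context built in the previous example; the accessor names come from the RBS signatures:

timings = context.timings
puts format('load %.2f ms | sample %.2f ms / %d tokens | eval %.2f ms / %d tokens',
            timings.t_load_ms, timings.t_sample_ms, timings.n_sample,
            timings.t_eval_ms, timings.n_eval)

# eval_embd feeds raw embeddings instead of token ids; the float array length is
# assumed to be n_tokens * context.n_embd, mirroring llama_eval_embd in the header:
# context.eval_embd(tokens: embeddings, n_past: 0, n_tokens: 1, n_threads: 4)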
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
- date: 2023-06-29 00:00:00.000000000 Z
+ date: 2023-07-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -25,6 +25,7 @@ files:
 - examples/README.md
 - examples/chat.rb
 - examples/embedding.rb
+ - examples/prompt_jp.txt
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h