llama_cpp 0.3.0 → 0.3.2

@@ -46,6 +46,8 @@
  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION 1

+ #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
+
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
  #define LLAMA_SUPPORTS_GPU_OFFLOAD
@@ -81,11 +83,11 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int seed; // RNG seed, -1 for random
- int n_ctx; // text context
- int n_batch; // prompt processing batch size
- int n_gpu_layers; // number of layers to store in VRAM
- int main_gpu; // the GPU that is used for scratch and small tensors
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
  // called with a progress value between 0 and 1, pass NULL to disable
  llama_progress_callback progress_callback;
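With the seed field now unsigned, the "pick a random seed" sentinel becomes LLAMA_DEFAULT_SEED (0xFFFFFFFF) rather than -1. A minimal Ruby-side sketch of what this means for callers; the `LLaMACpp::ContextParams#seed=` accessor is an assumption and does not appear in this diff:

```ruby
require 'llama_cpp'

# Minimal sketch, assuming ContextParams exposes a seed accessor (not shown here).
params = LLaMACpp::ContextParams.new
params.seed = 42            # a fixed seed makes sampling reproducible
# params.seed = 0xFFFFFFFF  # i.e. LLAMA_DEFAULT_SEED: ask for a randomly chosen seed
```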
@@ -132,6 +134,20 @@ extern "C" {
  bool quantize_output_tensor; // quantize output.weight
  } llama_model_quantize_params;

+ // performance timing information
+ struct llama_timings {
+ double t_start_ms;
+ double t_end_ms;
+ double t_load_ms;
+ double t_sample_ms;
+ double t_p_eval_ms;
+ double t_eval_ms;
+
+ int32_t n_sample;
+ int32_t n_p_eval;
+ int32_t n_eval;
+ };
+
  LLAMA_API struct llama_context_params llama_context_default_params();
  LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -196,7 +212,7 @@ extern "C" {
  LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

  // Sets the current rng seed.
- LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);

  // Returns the maximum size in bytes of the state (rng, logits, embedding
  // and kv_cache) - will often be smaller after compacting tokens
@@ -226,6 +242,14 @@ extern "C" {
  int n_past,
  int n_threads);

+ // Same as llama_eval, but use float matrix input directly.
+ LLAMA_API int llama_eval_embd(
+ struct llama_context * ctx,
+ const float * embd,
+ int n_tokens,
+ int n_past,
+ int n_threads);
+
  // Export a static computation graph for context of 511 and batch size of 1
  // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
  // parameters here to keep things simple
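The new entry point accepts raw embedding vectors instead of token ids, and the Ruby binding exposes it as `Context#eval_embd` (see the RBS changes below). A hedged sketch of how a call might look; the `Model` constructor arguments and the flat `Array[Float]` layout (`n_tokens * n_embd` values) are assumptions inferred from the C declaration, not shown in this diff:

```ruby
require 'llama_cpp'

# Sketch only: model construction arguments are assumptions, and the embedding
# layout (n_tokens * n_embd floats) is inferred from the C signature above.
params  = LLaMACpp::ContextParams.new
model   = LLaMACpp::Model.new(model_path: 'path/to/model.bin', params: params)
context = LLaMACpp::Context.new(model: model)

embd = Array.new(context.n_embd, 0.0) # one token's worth of embedding input
context.eval_embd(tokens: embd, n_past: 0, n_tokens: 1, n_threads: 4)
```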
@@ -321,6 +345,7 @@ extern "C" {
  LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

  // Performance information
+ LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
  LLAMA_API void llama_reset_timings(struct llama_context * ctx);

@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.3.0'
+ VERSION = '0.3.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-9d23589'
+ LLAMA_CPP_VERSION = 'master-481f793'
  end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
  # @param prompt [String] The prompt to start generation with.
  # @param n_predict [Integer] The number of tokens to predict.
  # @param n_threads [Integer] The number of threads.
+ # @param n_keep [Integer] The number of tokens to keep in the context.
+ # @param n_batch [Integer] The number of tokens to process in a batch.
+ # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+ # @param repeat_penalty [Float] The repetition penalty.
+ # @param frequency [Float] The frequency penalty.
+ # @param presence [Float] The presence penalty.
+ # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+ # @param top_p [Float] The probability threshold for nucleus sampling.
+ # @param tfs_z [Float] The z parameter for tail-free sampling.
+ # @param typical_p [Float] The probability for typical sampling.
+ # @param temperature [Float] The temperature for temperature sampling.
  # @return [String]
- def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+ repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+ top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
  raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
  raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

@@ -31,19 +45,8 @@ module LLaMACpp

  embd = []
  n_consumed = 0
- n_keep = 10
  n_past = 0
  n_remain = n_predict
- repeat_last_n = 64
- repeat_penalty = 1.1
- frequency = 0.0
- presence = 0.0
- top_k = 40
- top_p = 0.95
- tfs_z = 1.0
- typical_p = 1.0
- temperature = 0.8
- n_batch = 512
  n_vocab = context.n_vocab
  output = []

data/sig/llama_cpp.rbs CHANGED
@@ -4,6 +4,7 @@ module LLaMACpp
  LLAMA_FILE_VERSION: String
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String
+ LLAMA_DEFALUT_SEED: String

  LLAMA_MAX_DEVICES: Integer

@@ -27,7 +28,10 @@ module LLaMACpp

  def self?.init_backend: (?numa: bool) -> void
  def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String,
+ ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+ ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
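The sampling knobs that used to be hard-coded locals inside `generate` are now keyword arguments with the same defaults, as the signature above shows. A usage sketch, assuming `context` is an already-initialized `LLaMACpp::Context`:

```ruby
require 'llama_cpp'

# `context` is assumed to be an initialized LLaMACpp::Context.
# Defaults mirror the previously hard-coded values, so existing two-argument
# calls keep their behaviour; keywords are only needed to override a knob.
text = LLaMACpp.generate(context, 'Hello, my name is',
                         n_predict: 64, n_threads: 4,
                         top_k: 40, top_p: 0.95, temperature: 0.8,
                         repeat_last_n: 64, repeat_penalty: 1.1)
puts text
```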
@@ -66,18 +70,34 @@ module LLaMACpp
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end

+ class Timings
+ public
+
+ def t_start_ms: () -> Float
+ def t_end_ms: () -> Float
+ def t_load_ms: () -> Float
+ def t_sample_ms: () -> Float
+ def t_p_eval_ms: () -> Float
+ def t_eval_ms: () -> Float
+ def n_sample: () -> Integer
+ def n_p_eval: () -> Integer
+ def n_eval: () -> Integer
+ end
+
  class Context
  public

  def initialize: (model: ::LLaMACpp::Model) -> void
  def embeddings: () -> Array[Float]
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+ def eval_embd: (tokens: Array[Float], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
  def eval_export: (String) -> bool
  def logits: () -> Array[Float]
  def n_ctx: () -> Integer
  def n_embd: () -> Integer
  def n_vocab: () -> Integer
  def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+ def timings: () -> ::LLaMACpp::Timings
  def print_timings: () -> void
  def reset_timings: () -> void
  def token_to_str: (Integer) -> String
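The new `Timings` class mirrors the `llama_timings` struct added to llama.h above, giving programmatic access to the counters that `print_timings` only prints. A short sketch, assuming `context` has already evaluated a prompt:

```ruby
# Assumes `context` is a LLaMACpp::Context that has already processed a prompt.
timings = context.timings
puts format('load:   %.2f ms', timings.t_load_ms)
puts format('eval:   %.2f ms over %d tokens', timings.t_eval_ms, timings.n_eval)
puts format('sample: %.2f ms over %d tokens', timings.t_sample_ms, timings.n_sample)
```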
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.3.0
+ version: 0.3.2
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-06-29 00:00:00.000000000 Z
+ date: 2023-07-08 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -25,6 +25,7 @@ files:
  - examples/README.md
  - examples/chat.rb
  - examples/embedding.rb
+ - examples/prompt_jp.txt
  - ext/llama_cpp/extconf.rb
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h