llama_cpp 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/llama_cpp/src/llama.h CHANGED
@@ -19,9 +19,11 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 1
- #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
- #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+ #define LLAMA_FILE_VERSION 1
+ #define LLAMA_FILE_MAGIC 'ggjt'
+ #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+ #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
  extern "C" {
@@ -39,18 +41,22 @@ extern "C" {

  typedef struct llama_token_data {
  llama_token id; // token id
-
+ float logit; // log-odds of the token
  float p; // probability of the token
- float plog; // log probability of the token
-
  } llama_token_data;

+ typedef struct llama_token_data_array {
+ llama_token_data * data;
+ size_t size;
+ bool sorted;
+ } llama_token_data_array;
+
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
  int n_ctx; // text context
  int n_parts; // -1 for default
- int seed; // RNG seed, 0 for random
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,7 +79,10 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -112,22 +121,28 @@ extern "C" {
  const char * path_base_model,
  int n_threads);

- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+ // Returns the number of tokens in the KV cache
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);

- // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+ // Returns the maximum size in bytes of the state (rng, logits, embedding
+ // and kv_cache) - will often be smaller after compacting tokens
+ LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);

- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+ // Save/load session file
+ LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+ LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
@@ -152,9 +167,9 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);

- LLAMA_API int llama_n_vocab(struct llama_context * ctx);
- LLAMA_API int llama_n_ctx (struct llama_context * ctx);
- LLAMA_API int llama_n_embd (struct llama_context * ctx);
+ LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+ LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);

  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
@@ -168,21 +183,57 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

  // Special tokens
  LLAMA_API llama_token llama_token_bos();
  LLAMA_API llama_token llama_token_eos();
+ LLAMA_API llama_token llama_token_nl();
+
+ // Sampling functions
+
+ /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+ /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+ /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+ LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+ /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+ /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+ /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+ /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+ /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+ /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+ /// @details Selects the token with the highest probability.
+ LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);

- // TODO: improve the last_n_tokens interface ?
- LLAMA_API llama_token llama_sample_top_p_top_k(
- struct llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty);
+ /// @details Randomly selects a token from the candidates based on their probabilities.
+ LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

  // Performance information
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
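
The removal of llama_sample_top_p_top_k in favor of the candidate-based sampling functions is the most visible change for callers. As a rough migration sketch in Ruby, using the Context methods and the TokenData/TokenDataArray classes added later in this diff (the `context` and `last_n_tokens` variables and the parameter values are illustrative, not prescribed by the library):

    # 0.0.6: a single call handled penalties, top-k/top-p, and temperature.
    # id = context.sample_top_p_top_k(last_n_tokens, top_k: 40, top_p: 0.95, temp: 0.8, penalty: 1.1)

    # 0.1.0: build a candidate array from the logits, then chain the samplers.
    logits = context.logits
    candidates = LLaMACpp::TokenDataArray.new(
      Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
    )
    context.sample_repetition_penalty(candidates, last_n_tokens, penalty: 1.1)
    context.sample_top_k(candidates, k: 40)
    context.sample_top_p(candidates, prob: 0.95)
    context.sample_temperature(candidates, temperature: 0.8)
    id = context.sample_token(candidates)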
data/lib/llama_cpp/client.rb ADDED
@@ -0,0 +1,174 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+ # Client provides a high-level interface to the LLM model.
+ class Client # rubocop:disable Metrics/ClassLength
+ # Creates a new client.
+ #
+ # @param model_path [String] The path to the model file.
+ # @param lora_adapter_path [String] The path to the LoRA adapter file.
+ # @param lora_base_path [String] The path to the LoRA base model file.
+ # @param n_ctx [Integer] The context size.
+ # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
+ # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for memory kv.
+ # @param use_mmap [Boolean] The flag whether to use mmap.
+ # @param use_mlock [Boolean] The flag whether to use mlock.
+ # @param embedding [Boolean] The flag whether to calculate embedding.
+ # @param n_threads [Integer] The number of threads to use.
+ # @param seed [Integer] The seed for the random number generator.
+ # @return [Client]
+ # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+ def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+ n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ embedding: false,
+ n_threads: 1, seed: 0)
+ @params = {
+ model_path: model_path,
+ lora_adapter_path: lora_adapter_path,
+ lora_base_path: lora_base_path,
+ n_ctx: n_ctx,
+ n_parts: n_parts,
+ memory_f16: memory_f16,
+ use_mmap: use_mmap,
+ use_mlock: use_mlock,
+ embedding: embedding,
+ n_threads: n_threads,
+ seed: seed
+ }
+ @context_params = ContextParams.new
+ @context_params.n_ctx = n_ctx
+ @context_params.n_parts = n_parts
+ @context_params.f16_kv = memory_f16
+ @context_params.use_mmap = use_mmap
+ @context_params.use_mlock = use_mlock
+ @context_params.embedding = embedding
+ @context_params.seed = seed
+ @context = Context.new(model_path: model_path, params: @context_params)
+ return unless lora_adapter_path.is_a?(String)
+
+ if lora_base_path.is_a?(String)
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+ else
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+ end
+ end
+ # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+ # Generates completions for a given prompt.
+ #
+ # @param prompt [String] The prompt to generate completions for.
+ # @param max_tokens [Integer] The maximum number of tokens to generate.
+ # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+ # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+ # @param n_batch [Integer] The batch size.
+ # @param frequency [Float] The frequency penalty value.
+ # @param presence [Float] The presence penalty value.
+ # @param top_k [Integer] The top-k value.
+ # @param top_p [Float] The top-p value.
+ # @param tfs_z [Float] The tail free sampling parameter.
+ # @param typical_p [Float] The typical probability value.
+ # @param temperature [Float] The temperature value.
+ # @param repeat_penalty [Float] The repeat penalty value.
+ # @return [String]
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+ frequency: 0.0, presence: 0.0,
+ top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
+ embd_input = tokenize_prompt(prompt)
+
+ n_ctx = @context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+ last_n_tokens = [0] * n_ctx
+
+ embd = []
+ n_consumed = 0
+ n_past = 0
+ n_remain = max_tokens
+ n_vocab = @context.n_vocab
+ output = []
+
+ while n_remain != 0
+ unless embd.empty?
+ if n_past + embd.size > n_ctx
+ n_left = n_past - n_keep
+ n_past = n_keep
+ embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+ end
+
+ @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+ end
+
+ n_past += embd.size
+ embd.clear
+
+ if embd_input.size <= n_consumed
+ logits = @context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ @context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+ )
+
+ # temperature sampling
+ @context.sample_top_k(candidates, k: top_k)
+ @context.sample_tail_free(candidates, z: tfs_z)
+ @context.sample_typical(candidates, prob: typical_p)
+ @context.sample_top_p(candidates, prob: top_p)
+ @context.sample_temperature(candidates, temperature: temperature)
+ id = @context.sample_token(candidates)
+
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ embd.push(id)
+ n_remain -= 1
+ else
+ while embd_input.size > n_consumed
+ embd.push(embd_input[n_consumed])
+ last_n_tokens.shift
+ last_n_tokens.push(embd_input[n_consumed])
+ n_consumed += 1
+ break if embd.size >= n_batch
+ end
+ end
+
+ embd.each { |token| output << @context.token_to_str(token) }
+
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+ end
+
+ output.join.delete_prefix(" #{prompt}").strip
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+ # def chat(prompt); end
+
+ # Obtains the embedding for a given text.
+ #
+ # @param text [String] The text to obtain the embedding for.
+ # @return [Array<Float>]
+ def embeddings(text)
+ raise 'The embedding option is set to false' unless @params[:embedding]
+
+ embd_input = tokenize_prompt(text)
+ raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+ @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+ @context.embeddings
+ end
+
+ private
+
+ def tokenize_prompt(prompt)
+ @context.tokenize(text: " #{prompt}", add_bos: true)
+ end
+ end
+ end
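
The new lib/llama_cpp/client.rb shown above is the high-level entry point added in 0.1.0. A minimal usage sketch based on that code; the model path is a placeholder, and keyword arguments not given fall back to the defaults defined above:

    require 'llama_cpp'

    # Text completion with the default sampling settings.
    client = LLaMACpp::Client.new(model_path: '/path/to/model.bin', n_threads: 4)
    puts client.completions('Hello, my name is', max_tokens: 32)

    # Embeddings require a client created with embedding: true.
    embedder = LLaMACpp::Client.new(model_path: '/path/to/model.bin', embedding: true)
    vector = embedder.embeddings('Hello, world.')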
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.6'
+ VERSION = '0.1.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-12b5900'
+ LLAMA_CPP_VERSION = 'master-173d0e6'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@

  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'

  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,40 @@ module LLaMACpp

  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context]
- # @param prompt [String]
- # @param n_threads [Integer]
+ # @param context [LLaMACpp::Context] The context to use.
+ # @param prompt [String] The prompt to start generation with.
+ # @param n_predict [Integer] The number of tokens to predict.
+ # @param n_threads [Integer] The number of threads.
  # @return [String]
- def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- spaced_prompt = " #{prompt}"
+ def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ raise ArgumentError, 'context must have loaded the model' if context.empty?
+ raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)

+ spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)

  n_ctx = context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
  last_n_tokens = [0] * n_ctx

  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
- n_remain = 128
+ n_remain = n_predict
  repeat_last_n = 64
+ repeat_penalty = 1.1
+ frequency = 0.0
+ presence = 0.0
+ top_k = 40
+ top_p = 0.95
+ tfs_z = 1.0
+ typical_p = 1.0
+ temperature = 0.8
+ n_batch = 512
+ n_vocab = context.n_vocab
  output = []

  while n_remain != 0
@@ -47,10 +64,25 @@ module LLaMACpp
  embd.clear

  if embd_input.size <= n_consumed
- start = n_ctx - repeat_last_n
- id = context.sample_top_p_top_k(
- last_n_tokens[start...(start + repeat_last_n)], top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
+ logits = context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
  )
+
+ # temperature sampling
+ context.sample_top_k(candidates, k: top_k)
+ context.sample_tail_free(candidates, z: tfs_z)
+ context.sample_typical(candidates, prob: typical_p)
+ context.sample_top_p(candidates, prob: top_p)
+ context.sample_temperature(candidates, temperature: temperature)
+ id = context.sample_token(candidates)
+
  last_n_tokens.shift
  last_n_tokens.push(id)

@@ -62,13 +94,13 @@ module LLaMACpp
  last_n_tokens.shift
  last_n_tokens.push(embd_input[n_consumed])
  n_consumed += 1
- break if embd.size >= 512
+ break if embd.size >= n_batch
  end
  end

  embd.each { |token| output << context.token_to_str(token) }

- break if embd[-1] == LLaMACpp.token_eos
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end

  output.join.delete_prefix(spaced_prompt).strip
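
For the lower-level path, the module-level helper now accepts n_predict. A minimal sketch based on the signatures in this diff; the model path and parameter values below are placeholders:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 42
    context = LLaMACpp::Context.new(model_path: '/path/to/model.bin', params: params)
    puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 32, n_threads: 4)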
data/sig/llama_cpp.rbs CHANGED
@@ -11,22 +11,46 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
- LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+ LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_1: Integer

  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool

+ class TokenData
+ public
+
+ def initialize: (id: Integer, logit: Float, p: Float) -> void
+ def id: () -> Integer
+ def id=: (Integer) -> Integer
+ def logit: () -> Float
+ def logit=: (Float) -> Float
+ def p: () -> Float
+ def p=: (Float) -> Float
+ end
+
+ class TokenDataArray
+ public
+
+ def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+ def size: () -> Integer
+ def sorted: () -> bool
+ end
+
  class Context
  public

  def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  | () -> void
  def embeddings: () -> Array[Float]
+ def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
@@ -36,10 +60,23 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def print_timings: () -> void
  def reset_timings: () -> void
- def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def kv_cache_token_count: () -> Integer
+ def set_rng_seed: (Integer) -> void
+ def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+ def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+ def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+ def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+ def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+ def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+ def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+ def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+ def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
  end

  class ContextParams
@@ -59,9 +96,22 @@ module LLaMACpp
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
  def use_mlock=: (bool) -> bool
+ def use_mmap: () -> bool
+ def use_mmap=: (bool) -> bool
  def vocab_only: () -> bool
  def vocab_only=: (bool) -> bool
  end

  class Params = ContextParams
+
+ class Client
+ def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+ ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+ ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+ def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+ ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+ ?repeat_penalty: Float) -> String
+ def embeddings(String) -> Array[Float]
+ end
  end
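
The signatures above also expose the two Mirostat samplers, which, per the RBS, return the sampled token id together with the updated mu state. A hedged sketch of driving Mirostat 2.0, assuming `context` and `candidates` were prepared as in the generate code earlier in this diff (the tau and eta values are illustrative):

    tau = 5.0        # target surprise value
    eta = 0.1        # learning rate
    mu  = 2.0 * tau  # per the llama.h docs, mu starts at twice the target cross-entropy
    id, mu = context.sample_token_mirostat_v2(candidates, tau: tau, eta: eta, mu: mu)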
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.6
+ version: 0.1.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-22 00:00:00.000000000 Z
+ date: 2023-05-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,12 +27,15 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
- - ext/llama_cpp/src/llama_util.h
  - lib/llama_cpp.rb
+ - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb
  - sig/llama_cpp.rbs
  homepage: https://github.com/yoshoku/llama_cpp.rb