llama_cpp 0.0.6 → 0.1.0

@@ -19,9 +19,11 @@
  # define LLAMA_API
  #endif
 
- #define LLAMA_FILE_VERSION 1
- #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
- #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+ #define LLAMA_FILE_VERSION 1
+ #define LLAMA_FILE_MAGIC 'ggjt'
+ #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+ #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_SESSION_VERSION 1
 
  #ifdef __cplusplus
  extern "C" {
@@ -39,18 +41,22 @@ extern "C" {
 
  typedef struct llama_token_data {
  llama_token id; // token id
-
+ float logit; // log-odds of the token
  float p; // probability of the token
- float plog; // log probability of the token
-
  } llama_token_data;
 
+ typedef struct llama_token_data_array {
+ llama_token_data * data;
+ size_t size;
+ bool sorted;
+ } llama_token_data_array;
+
  typedef void (*llama_progress_callback)(float progress, void *ctx);
 
  struct llama_context_params {
  int n_ctx; // text context
  int n_parts; // -1 for default
- int seed; // RNG seed, 0 for random
+ int seed; // RNG seed, -1 for random
 
  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,7 +79,10 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
  LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };
 
  LLAMA_API struct llama_context_params llama_context_default_params();
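The quantization enum drops Q4_3 and adds Q8_0, Q5_0, and Q5_1. A minimal sketch of selecting one of the new formats through the gem's existing model_quantize binding; the file paths below are placeholders, not paths from this release:

    require 'llama_cpp'

    # Re-quantize an f16 GGML model to the new 5-bit Q5_0 format.
    # Input/output paths are hypothetical; ftype uses the enum value added above.
    LLaMACpp.model_quantize(
      input_path: 'models/7B/ggml-model-f16.bin',
      output_path: 'models/7B/ggml-model-q5_0.bin',
      ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q5_0
    )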
@@ -112,22 +121,28 @@ extern "C" {
  const char * path_base_model,
  int n_threads);
 
- // Returns the KV cache that will contain the context for the
- // ongoing prediction with the model.
- LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+ // Returns the number of tokens in the KV cache
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
- // Returns the size of the KV cache
- LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+ // Sets the current rng seed.
+ LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
- // Returns the number of tokens in the KV cache
- LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+ // Returns the maximum size in bytes of the state (rng, logits, embedding
+ // and kv_cache) - will often be smaller after compacting tokens
+ LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
- // Sets the KV cache containing the current context for the model
- LLAMA_API void llama_set_kv_cache(
- struct llama_context * ctx,
- const uint8_t * kv_cache,
- size_t n_size,
- int n_token_count);
+ // Copies the state to the specified destination address.
+ // Destination needs to have allocated enough memory.
+ // Returns the number of bytes copied
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+ // Set the state reading from the specified address
+ // Returns the number of bytes read
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+ // Save/load session file
+ LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+ LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
 
  // Run the llama inference to obtain the logits and probabilities for the next token.
  // tokens + n_tokens is the provided batch of new tokens to process
@@ -152,9 +167,9 @@ extern "C" {
  int n_max_tokens,
  bool add_bos);
 
- LLAMA_API int llama_n_vocab(struct llama_context * ctx);
- LLAMA_API int llama_n_ctx (struct llama_context * ctx);
- LLAMA_API int llama_n_embd (struct llama_context * ctx);
+ LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+ LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
  // Token logits obtained from the last call to llama_eval()
  // The logits for the last token are stored in the last row
@@ -168,21 +183,57 @@ extern "C" {
  LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
  // Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
  // Special tokens
  LLAMA_API llama_token llama_token_bos();
  LLAMA_API llama_token llama_token_eos();
+ LLAMA_API llama_token llama_token_nl();
+
+ // Sampling functions
+
+ /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+ LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+ /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+ LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+ /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+ LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+ /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+ /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+ /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+ /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+ /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+ /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+ /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+ /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+ /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+ /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+ LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+ /// @details Selects the token with the highest probability.
+ LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
 
- // TODO: improve the last_n_tokens interface ?
- LLAMA_API llama_token llama_sample_top_p_top_k(
- struct llama_context * ctx,
- const llama_token * last_n_tokens_data,
- int last_n_tokens_size,
- int top_k,
- float top_p,
- float temp,
- float repeat_penalty);
+ /// @details Randomly selects a token from the candidates based on their probabilities.
+ LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
  // Performance information
  LLAMA_API void llama_print_timings(struct llama_context * ctx);
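The single llama_sample_top_p_top_k call is replaced by composable samplers that operate on a llama_token_data_array. A condensed sketch of one sampling step as exposed through the gem's Context bindings, assuming context holds a loaded model, last_n_tokens tracks recently emitted token ids, and logits come from a prior eval call:

    # Build the candidate array from the logits of the last evaluated position.
    logits = context.logits
    candidates = LLaMACpp::TokenDataArray.new(
      Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
    )

    # Penalize recently seen tokens, then filter and sample.
    context.sample_repetition_penalty(candidates, last_n_tokens, penalty: 1.1)
    context.sample_top_k(candidates, k: 40)
    context.sample_top_p(candidates, prob: 0.95)
    context.sample_temperature(candidates, temperature: 0.8)
    id = context.sample_token(candidates) # id of the next token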
@@ -0,0 +1,174 @@
+ # frozen_string_literal: true
+
+ module LLaMACpp
+ # Client provides a high-level interface to the LLM.
+ class Client # rubocop:disable Metrics/ClassLength
+ # Creates a new client.
+ #
+ # @param model_path [String] The path to the model file.
+ # @param lora_adapter_path [String] The path to the LoRA adapter file.
+ # @param lora_base_path [String] The path to the LoRA base model file.
+ # @param n_ctx [Integer] The context size.
+ # @param n_parts [Integer] The number of model parts (-1 = determine from model dimensions).
+ # @param memory_f16 [Boolean] The flag whether to use f16 instead of f32 for the KV cache.
+ # @param use_mmap [Boolean] The flag whether to use mmap.
+ # @param use_mlock [Boolean] The flag whether to use mlock.
+ # @param embedding [Boolean] The flag whether to calculate embedding.
+ # @param n_threads [Integer] The number of threads to use.
+ # @param seed [Integer] The seed for the random number generator.
+ # @return [Client]
+ # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+ def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
+ n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ embedding: false,
+ n_threads: 1, seed: 0)
+ @params = {
+ model_path: model_path,
+ lora_adapter_path: lora_adapter_path,
+ lora_base_path: lora_base_path,
+ n_ctx: n_ctx,
+ n_parts: n_parts,
+ memory_f16: memory_f16,
+ use_mmap: use_mmap,
+ use_mlock: use_mlock,
+ embedding: embedding,
+ n_threads: n_threads,
+ seed: seed
+ }
+ @context_params = ContextParams.new
+ @context_params.n_ctx = n_ctx
+ @context_params.n_parts = n_parts
+ @context_params.f16_kv = memory_f16
+ @context_params.use_mmap = use_mmap
+ @context_params.use_mlock = use_mlock
+ @context_params.embedding = embedding
+ @context_params.seed = seed
+ @context = Context.new(model_path: model_path, params: @context_params)
+ return unless lora_adapter_path.is_a?(String)
+
+ if lora_base_path.is_a?(String)
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
+ else
+ @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
+ end
+ end
+ # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
+
+ # Generates completions for a given prompt.
+ #
+ # @param prompt [String] The prompt to generate completions for.
+ # @param max_tokens [Integer] The maximum number of tokens to generate.
+ # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
+ # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
+ # @param n_batch [Integer] The batch size.
+ # @param frequency [Float] The frequency penalty value.
+ # @param presence [Float] The presence penalty value.
+ # @param top_k [Integer] The top-k value.
+ # @param top_p [Float] The top-p value.
+ # @param tfs_z [Float] The tail free sampling parameter.
+ # @param typical_p [Float] The typical probability value.
+ # @param temperature [Float] The temperature value.
+ # @param repeat_penalty [Float] The repeat penalty value.
+ # @return [String]
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+ def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
+ frequency: 0.0, presence: 0.0,
+ top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
+ embd_input = tokenize_prompt(prompt)
+
+ n_ctx = @context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
+ last_n_tokens = [0] * n_ctx
+
+ embd = []
+ n_consumed = 0
+ n_past = 0
+ n_remain = max_tokens
+ n_vocab = @context.n_vocab
+ output = []
+
+ while n_remain != 0
+ unless embd.empty?
+ if n_past + embd.size > n_ctx
+ n_left = n_past - n_keep
+ n_past = n_keep
+ embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
+ end
+
+ @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
+ end
+
+ n_past += embd.size
+ embd.clear
+
+ if embd_input.size <= n_consumed
+ logits = @context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ @context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
+ )
+
+ # temperature sampling
+ @context.sample_top_k(candidates, k: top_k)
+ @context.sample_tail_free(candidates, z: tfs_z)
+ @context.sample_typical(candidates, prob: typical_p)
+ @context.sample_top_p(candidates, prob: top_p)
+ @context.sample_temperature(candidates, temperature: temperature)
+ id = @context.sample_token(candidates)
+
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ last_n_tokens.shift
+ last_n_tokens.push(id)
+
+ embd.push(id)
+ n_remain -= 1
+ else
+ while embd_input.size > n_consumed
+ embd.push(embd_input[n_consumed])
+ last_n_tokens.shift
+ last_n_tokens.push(embd_input[n_consumed])
+ n_consumed += 1
+ break if embd.size >= n_batch
+ end
+ end
+
+ embd.each { |token| output << @context.token_to_str(token) }
+
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
+ end
+
+ output.join.delete_prefix(" #{prompt}").strip
+ end
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+
+ # def chat(prompt); end
+
+ # Obtains the embedding for a given text.
+ #
+ # @param text [String] The text to obtain the embedding for.
+ # @return [Array<Float>]
+ def embeddings(text)
+ raise 'The embedding option is set to false' unless @params[:embedding]
+
+ embd_input = tokenize_prompt(text)
+ raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
+
+ @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
+ @context.embeddings
+ end
+
+ private
+
+ def tokenize_prompt(prompt)
+ @context.tokenize(text: " #{prompt}", add_bos: true)
+ end
+ end
+ end
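The new Client class wires the context parameters and the sampling loop together. A minimal usage sketch; the model path and prompts are placeholders, and embeddings requires a client built with embedding: true:

    require 'llama_cpp'

    client = LLaMACpp::Client.new(model_path: 'models/7B/ggml-model-q4_0.bin', n_threads: 4)
    puts client.completions('What is the best programming language?', max_tokens: 32)

    # Embeddings need the embedding flag enabled at construction time.
    embed_client = LLaMACpp::Client.new(model_path: 'models/7B/ggml-model-q4_0.bin', embedding: true)
    vector = embed_client.embeddings('Hello, world.') # => Array of Float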
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.6'
+ VERSION = '0.1.0'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-12b5900'
+ LLAMA_CPP_VERSION = 'master-173d0e6'
  end
data/lib/llama_cpp.rb CHANGED
@@ -2,6 +2,7 @@
 
  require_relative 'llama_cpp/version'
  require_relative 'llama_cpp/llama_cpp'
+ require_relative 'llama_cpp/client'
 
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
@@ -12,24 +13,40 @@ module LLaMACpp
 
  # Generates sentences following the given prompt for operation check.
  #
- # @param context [LLaMACpp::Context]
- # @param prompt [String]
- # @param n_threads [Integer]
+ # @param context [LLaMACpp::Context] The context to use.
+ # @param prompt [String] The prompt to start generation with.
+ # @param n_predict [Integer] The number of tokens to predict.
+ # @param n_threads [Integer] The number of threads.
  # @return [String]
- def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- spaced_prompt = " #{prompt}"
+ def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+ raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
+ raise ArgumentError, 'context must have loaded the model' if context.empty?
+ raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
+ spaced_prompt = " #{prompt}"
  embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
  n_ctx = context.n_ctx
+ raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
+
  last_n_tokens = [0] * n_ctx
 
  embd = []
  n_consumed = 0
  n_keep = 10
  n_past = 0
- n_remain = 128
+ n_remain = n_predict
  repeat_last_n = 64
+ repeat_penalty = 1.1
+ frequency = 0.0
+ presence = 0.0
+ top_k = 40
+ top_p = 0.95
+ tfs_z = 1.0
+ typical_p = 1.0
+ temperature = 0.8
+ n_batch = 512
+ n_vocab = context.n_vocab
  output = []
 
  while n_remain != 0
@@ -47,10 +64,25 @@ module LLaMACpp
  embd.clear
 
  if embd_input.size <= n_consumed
- start = n_ctx - repeat_last_n
- id = context.sample_top_p_top_k(
- last_n_tokens[start...(start + repeat_last_n)], top_k: 40, top_p: 0.95, temp: 0.80, penalty: 1.1
+ logits = context.logits
+ base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
+ candidates = LLaMACpp::TokenDataArray.new(base_candidates)
+
+ # apply penalties
+ last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
+ context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
+ context.sample_frequency_and_presence_penalties(
+ candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
  )
+
+ # temperature sampling
+ context.sample_top_k(candidates, k: top_k)
+ context.sample_tail_free(candidates, z: tfs_z)
+ context.sample_typical(candidates, prob: typical_p)
+ context.sample_top_p(candidates, prob: top_p)
+ context.sample_temperature(candidates, temperature: temperature)
+ id = context.sample_token(candidates)
+
  last_n_tokens.shift
  last_n_tokens.push(id)
 
@@ -62,13 +94,13 @@ module LLaMACpp
  last_n_tokens.shift
  last_n_tokens.push(embd_input[n_consumed])
  n_consumed += 1
- break if embd.size >= 512
+ break if embd.size >= n_batch
  end
  end
 
  embd.each { |token| output << context.token_to_str(token) }
 
- break if embd[-1] == LLaMACpp.token_eos
+ break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
  end
 
  output.join.delete_prefix(spaced_prompt).strip
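generate now validates its arguments and accepts n_predict. A short sketch of calling it with the updated signature; the model path is a placeholder:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    params.seed = 123 # per the header change above, -1 selects a random seed
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)
    puts LLaMACpp.generate(context, 'Hello, my name is', n_predict: 64, n_threads: 4)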
data/sig/llama_cpp.rbs CHANGED
@@ -11,22 +11,46 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
- LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+ LLAMA_FTYPE_MOSTLY_Q8_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
- def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
+ def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.token_nl: () -> Integer
  def self?.mmap_supported?: () -> bool
  def self?.mlock_supported?: () -> bool
 
+ class TokenData
+ public
+
+ def initialize: (id: Integer, logit: Float, p: Float) -> void
+ def id: () -> Integer
+ def id=: (Integer) -> Integer
+ def logit: () -> Float
+ def logit=: (Float) -> Float
+ def p: () -> Float
+ def p=: (Float) -> Float
+ end
+
+ class TokenDataArray
+ public
+
+ def initialize: (Array[::LLaMACpp::TokenData], ?sorted: bool) -> void
+ def size: () -> Integer
+ def sorted: () -> bool
+ end
+
  class Context
  public
 
  def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  | () -> void
  def embeddings: () -> Array[Float]
+ def empty?: () -> bool
  def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
@@ -36,10 +60,23 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def print_timings: () -> void
  def reset_timings: () -> void
- def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+ def kv_cache_token_count: () -> Integer
+ def set_rng_seed: (Integer) -> void
+ def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
+ def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+ def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
+ def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
+ def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
+ def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
+ def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
+ def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
+ def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
+ def sample_token_greedy: (::LLaMACpp::TokenDataArray) -> Integer
+ def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
  end
 
  class ContextParams
@@ -59,9 +96,22 @@ module LLaMACpp
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
  def use_mlock=: (bool) -> bool
+ def use_mmap: () -> bool
+ def use_mmap=: (bool) -> bool
  def vocab_only: () -> bool
  def vocab_only=: (bool) -> bool
  end
 
  class Params = ContextParams
+
+ class Client
+ def initialize: (model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
+ ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+ ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
+ def completions: (String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
+ ?frequency: Float, ?presence: Float,
+ ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
+ ?repeat_penalty: Float) -> String
+ def embeddings: (String) -> Array[Float]
+ end
  end
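Per these signatures, the mirostat samplers return both the sampled token id and the updated mu value. A hedged sketch of a Mirostat 2.0 step, assuming candidates was built from the current logits as in the earlier examples and using commonly cited parameter values rather than values taken from this release:

    tau = 5.0       # target surprise (cross-entropy)
    eta = 0.1       # learning rate for mu updates
    mu  = 2.0 * tau # initialized to twice the target, as described in llama.h
    id, mu = context.sample_token_mirostat_v2(candidates, tau: tau, eta: eta, mu: mu)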
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.6
+ version: 0.1.0
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-22 00:00:00.000000000 Z
+ date: 2023-05-20 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -27,12 +27,15 @@ files:
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
  - ext/llama_cpp/src/ggml-cuda.h
+ - ext/llama_cpp/src/ggml-opencl.c
+ - ext/llama_cpp/src/ggml-opencl.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
+ - ext/llama_cpp/src/llama-util.h
  - ext/llama_cpp/src/llama.cpp
  - ext/llama_cpp/src/llama.h
- - ext/llama_cpp/src/llama_util.h
  - lib/llama_cpp.rb
+ - lib/llama_cpp/client.rb
  - lib/llama_cpp/version.rb
  - sig/llama_cpp.rbs
  homepage: https://github.com/yoshoku/llama_cpp.rb