llama_cpp 0.2.2 → 0.3.0

data/ext/llama_cpp/src/llama.h CHANGED
@@ -26,6 +26,14 @@
 # define LLAMA_API
 #endif
 
+#ifdef __GNUC__
+# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+# define DEPRECATED(func, hint) func
+#endif
+
 #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
 // TODO: show sample usage
 //
 
+struct llama_model;
 struct llama_context;
 
 typedef int llama_token;
@@ -131,17 +140,29 @@ extern "C" {
 
 // TODO: not great API - very likely to change
 // Initialize the llama + ggml backend
+// If numa is true, use NUMA optimizations
 // Call once at the start of the program
-LLAMA_API void llama_init_backend();
+LLAMA_API void llama_init_backend(bool numa);
 
 LLAMA_API int64_t llama_time_us();
 
+LLAMA_API struct llama_model * llama_load_model_from_file(
+        const char * path_model,
+        struct llama_context_params params);
+
+LLAMA_API void llama_free_model(struct llama_model * model);
+
+LLAMA_API struct llama_context * llama_new_context_with_model(
+        struct llama_model * model,
+        struct llama_context_params params);
+
 // Various functions for loading a ggml llama model.
 // Allocate (almost) all memory needed for the model.
 // Return NULL on failure
-LLAMA_API struct llama_context * llama_init_from_file(
+LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
         const char * path_model,
-        struct llama_context_params params);
+        struct llama_context_params params),
+        "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
 
 // Frees all allocated memory
 LLAMA_API void llama_free(struct llama_context * ctx);
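
For orientation only (this snippet is not part of the diff), the new API splits model loading from context creation. A minimal C sketch, assuming llama_context_default_params() from the unchanged part of the header; the model path is a placeholder:

// Sketch of the 0.3.0 loading flow: load the weights once, then build a context from them.
#include "llama.h"
#include <stdbool.h>
#include <stdio.h>

int main(void) {
    llama_init_backend(false); // NUMA optimizations disabled

    struct llama_context_params params = llama_context_default_params();

    // "model.bin" is a placeholder path
    struct llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize and evaluate here ...

    llama_free(ctx);         // frees the context only
    llama_free_model(model); // the model is now freed separately
    return 0;
}

The deprecated llama_init_from_file() still works and bundles both steps, but the deprecation hint points new code at the two-step API.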
@@ -158,8 +179,15 @@ extern "C" {
 // The model needs to be reloaded before applying a new adapter, otherwise the adapter
 // will be applied on top of the previous one
 // Returns 0 on success
-LLAMA_API int llama_apply_lora_from_file(
+LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
         struct llama_context * ctx,
+        const char * path_lora,
+        const char * path_base_model,
+        int n_threads),
+        "please use llama_model_apply_lora_from_file instead");
+
+LLAMA_API int llama_model_apply_lora_from_file(
+        const struct llama_model * model,
         const char * path_lora,
         const char * path_base_model,
         int n_threads);
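
Likewise, LoRA adapters now attach to the model rather than to a context. A rough sketch (not from the diff): paths and thread count are placeholders, and passing NULL for path_base_model is an assumption carried over from the old context-level call.

// Sketch: apply a LoRA adapter to an already loaded llama_model.
int rc = llama_model_apply_lora_from_file(
        model,         // from llama_load_model_from_file
        "adapter.bin", // path_lora (placeholder)
        NULL,          // path_base_model (assumed optional, as before)
        4);            // n_threads
if (rc != 0) {
    fprintf(stderr, "failed to apply LoRA adapter\n");
}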
@@ -310,7 +338,7 @@ extern "C" {
 #include <string>
 struct ggml_tensor;
 
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
 #endif
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.2.2'
+  VERSION = '0.3.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-7487137'
+  LLAMA_CPP_VERSION = 'master-9d23589'
 end
data/lib/llama_cpp.rb CHANGED
@@ -2,7 +2,6 @@
 
 require_relative 'llama_cpp/version'
 require_relative 'llama_cpp/llama_cpp'
-require_relative 'llama_cpp/client'
 
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
@@ -20,7 +19,6 @@ module LLaMACpp
   # @return [String]
   def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
-    raise ArgumentError, 'context must have loaded the model' if context.empty?
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
     spaced_prompt = " #{prompt}"
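
For reference (not part of this file), a hypothetical sketch of calling generate under 0.3.0: a Context is now built from a Model, which is presumably why the empty? guard above was dropped. The model path and sampling arguments are placeholders:

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
context = LLaMACpp::Context.new(model: model)

puts LLaMACpp.generate(context, 'Hello, World.', n_predict: 64, n_threads: 4)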
data/sig/llama_cpp.rbs CHANGED
@@ -25,7 +25,7 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: () -> void
+  def self?.init_backend: (?numa: bool) -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
@@ -55,17 +55,24 @@ module LLaMACpp
     def sorted: () -> bool
   end
 
-  class Context
+  class Model
     public
 
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
-    def embeddings: () -> Array[Float]
     def empty?: () -> bool
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
-    def eval_export: (String) -> bool
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
+  end
+
+  class Context
+    public
+
+    def initialize: (model: ::LLaMACpp::Model) -> void
+    def embeddings: () -> Array[Float]
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
+    def eval_export: (String) -> bool
     def logits: () -> Array[Float]
     def n_ctx: () -> Integer
     def n_embd: () -> Integer
@@ -75,7 +82,6 @@ module LLaMACpp
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
-    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
    def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
     def load_session_file: (session_path: String) -> void
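
For orientation (this snippet is not part of the signature file), a hypothetical sketch of the new split: the LoRA adapter is applied to the Model before a Context is built from it. Paths and thread count are placeholders:

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
model.apply_lora_from_file(lora_path: '/path/to/adapter.bin', n_threads: 2)

context = LLaMACpp::Context.new(model: model)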
@@ -138,15 +144,4 @@ module LLaMACpp
   end
 
   class Params = ContextParams
-
-  class Client
-    def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-                   ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
-                   ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
-    def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
-                    ?frequency: Float, ?presence: Float,
-                    ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float,
-                    ?repeat_penalty: Float) -> String
-    def embeddings(String) -> Array[Float]
-  end
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.2.2
+  version: 0.3.0
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-06-23 00:00:00.000000000 Z
+date: 2023-06-29 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -44,7 +44,6 @@ files:
 - ext/llama_cpp/src/llama.cpp
 - ext/llama_cpp/src/llama.h
 - lib/llama_cpp.rb
-- lib/llama_cpp/client.rb
 - lib/llama_cpp/version.rb
 - sig/llama_cpp.rbs
 homepage: https://github.com/yoshoku/llama_cpp.rb
data/lib/llama_cpp/client.rb DELETED
@@ -1,172 +0,0 @@
-# frozen_string_literal: true
-
-module LLaMACpp
-  # Client provides a high-level interface to the LLM model.
-  class Client # rubocop:disable Metrics/ClassLength
-    # Creates a new client.
-    #
-    # @param model_path [String] The path to the model file.
-    # @param lora_adapter_path [String] The path to the LoRA adapter file.
-    # @param lora_base_path [String] The path to the LoRA base model file.
-    # @param n_ctx [Integer] The context size.
-    # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
-    # @param use_mmap [Boolean] The flag whether to use mmap.
-    # @param use_mlock [Boolean] The flag hether to use mlock.
-    # @param embedding [Boolean] The flag whether to calculate embedding.
-    # @param n_threads [Integer] The number of threads to use.
-    # @param seed [Integer] The seed for the random number generator.
-    # @return [Client]
-    # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
-    def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-                   n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
-                   embedding: false,
-                   n_threads: 1, seed: 0)
-      @params = {
-        model_path: model_path,
-        lora_adapter_path: lora_adapter_path,
-        lora_base_path: lora_base_path,
-        n_ctx: n_ctx,
-        memory_f16: memory_f16,
-        use_mmap: use_mmap,
-        use_mlock: use_mlock,
-        embedding: embedding,
-        n_threads: n_threads,
-        seed: seed
-      }
-      @context_params = ContextParams.new
-      @context_params.n_ctx = n_ctx
-      @context_params.n_parts = n_parts
-      @context_params.f16_kv = memory_f16
-      @context_params.use_mmap = use_mmap
-      @context_params.use_mlock = use_mlock
-      @context_params.embedding = embedding
-      @context_params.seed = seed
-      @context = Context.new(model_path: model_path, params: @context_params)
-      return unless lora_adapter_path.is_a?(String)
-
-      if lora_base_path.is_a?(String)
-        @context.apply_lora_from_file(lora_path: lora_adapter_path, base_model_path: lora_base_path, n_threads: n_threads)
-      else
-        @context.apply_lora_from_file(lora_path: lora_adapter_path, n_threads: n_threads)
-      end
-    end
-    # rubocop:enable Metrics/MethodLength, Metrics/ParameterLists
-
-    # Generates completions for a given prompt.
-    #
-    # @param prompt [String] The prompt to generate completions for.
-    # @param max_tokens [Integer] The maximum number of tokens to generate.
-    # @param n_keep [Integer] The number of tokens to keep from the initial prompt.
-    # @param repeat_last_n [Integer] The number of tokens to use for repeat penalty.
-    # @param n_batch [Integer] The batch size.
-    # @param frequency [Float] The frequency penalty value.
-    # @param presence [Float] The presence penalty value.
-    # @param top_k [Integer] The top-k value.
-    # @param top_p [Float] The top-p value.
-    # @param tfs_z [Float] The tail free sampling parameter.
-    # @param typical_p [Float] The typical probability value.
-    # @param temperature [Float] The temperature value.
-    # @param repeat_penalty [Float] The repeat penalty value.
-    # @return [String]
-    # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-    def completions(prompt, max_tokens: 64, n_keep: 10, repeat_last_n: 64, n_batch: 512,
-                    frequency: 0.0, presence: 0.0,
-                    top_k: 40, top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8, repeat_penalty: 1.1)
-      embd_input = tokenize_prompt(prompt)
-
-      n_ctx = @context.n_ctx
-      raise ArgumentError, "prompt is too long #{embd_input.size} tokens, maximum is #{n_ctx - 4}" if embd_input.size > n_ctx - 4
-
-      last_n_tokens = [0] * n_ctx
-
-      embd = []
-      n_consumed = 0
-      n_past = 0
-      n_remain = max_tokens
-      n_vocab = @context.n_vocab
-      output = []
-
-      while n_remain != 0
-        unless embd.empty?
-          if n_past + embd.size > n_ctx
-            n_left = n_past - n_keep
-            n_past = n_keep
-            embd.insert(0, last_n_tokens[(n_ctx - (n_left / 2) - embd.size)...-embd.size])
-          end
-
-          @context.eval(tokens: embd, n_past: n_past, n_threads: @params[:n_threads])
-        end
-
-        n_past += embd.size
-        embd.clear
-
-        if embd_input.size <= n_consumed
-          logits = @context.logits
-          base_candidates = Array.new(n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
-          candidates = LLaMACpp::TokenDataArray.new(base_candidates)
-
-          # apply penalties
-          last_n_repeat = [last_n_tokens.size, repeat_last_n, n_ctx].min
-          @context.sample_repetition_penalty(candidates, last_n_tokens[-last_n_repeat..], penalty: repeat_penalty)
-          @context.sample_frequency_and_presence_penalties(
-            candidates, last_n_tokens[-last_n_repeat..], frequency: frequency, presence: presence
-          )
-
-          # temperature sampling
-          @context.sample_top_k(candidates, k: top_k)
-          @context.sample_tail_free(candidates, z: tfs_z)
-          @context.sample_typical(candidates, prob: typical_p)
-          @context.sample_top_p(candidates, prob: top_p)
-          @context.sample_temperature(candidates, temperature: temperature)
-          id = @context.sample_token(candidates)
-
-          last_n_tokens.shift
-          last_n_tokens.push(id)
-
-          last_n_tokens.shift
-          last_n_tokens.push(id)
-
-          embd.push(id)
-          n_remain -= 1
-        else
-          while embd_input.size > n_consumed
-            embd.push(embd_input[n_consumed])
-            last_n_tokens.shift
-            last_n_tokens.push(embd_input[n_consumed])
-            n_consumed += 1
-            break if embd.size >= n_batch
-          end
-        end
-
-        embd.each { |token| output << @context.token_to_str(token) }
-
-        break if !embd.empty? && embd[-1] == LLaMACpp.token_eos
-      end
-
-      output.join.delete_prefix(" #{prompt}").strip
-    end
-    # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
-
-    # def chat(prompt); end
-
-    # Obtains the embedding for a given text.
-    #
-    # @param text [String] The text to obtain the embedding for.
-    # @return [Array<Float>]
-    def embeddings(text)
-      raise 'The embedding option is set to false' unless @params[:embedding]
-
-      embd_input = tokenize_prompt(text)
-      raise 'The result of tokenizing the input text is empty' unless embd_input.size.positive?
-
-      @context.eval(tokens: embd_input, n_past: 0, n_threads: @params[:n_threads])
-      @context.embeddings
-    end
-
-    private
-
-    def tokenize_prompt(prompt)
-      @context.tokenize(text: " #{prompt}", add_bos: true)
-    end
-  end
-end
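
With Client gone, its embeddings helper can be reproduced with the low-level API. A hypothetical sketch based on the signatures above (model path, text, and thread count are placeholders; embedding must be enabled on the params):

require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.embedding = true

model = LLaMACpp::Model.new(model_path: '/path/to/model.bin', params: params)
context = LLaMACpp::Context.new(model: model)

tokens = context.tokenize(text: ' Hello, World.', add_bos: true)
context.eval(tokens: tokens, n_past: 0, n_threads: 2)
vector = context.embeddings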