llama_cpp 0.3.1 → 0.3.3

@@ -134,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -144,7 +158,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
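The Ruby binding mirrors this rename (see the llama_cpp.rb and RBS changes below). A minimal usage sketch, assuming a model and context are set up elsewhere:

    LLaMACpp.backend_init(numa: false) # call once at program start
    # ... load a model, create a context, generate ...
    LLaMACpp.backend_free              # call once at exit; currently only relevant for MPI builds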
@@ -293,6 +309,18 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
@@ -331,6 +359,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.1'
+  VERSION = '0.3.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-b8c8dda'
+  LLAMA_CPP_VERSION = 'master-32c5411'
 end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
@@ -105,4 +108,4 @@ module LLaMACpp
   end
 end
 
-LLaMACpp.init_backend
+LLaMACpp.backend_init
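With the sampling defaults promoted to keyword arguments, a call to the generator might look like the sketch below; creating `context` from a model file is assumed and not shown:

    text = LLaMACpp.generate(context, 'Hello, my name is',
                             n_predict: 64, n_threads: 4,
                             top_k: 40, top_p: 0.95, temperature: 0.7,
                             repeat_last_n: 64, repeat_penalty: 1.1)
    puts text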
data/sig/llama_cpp.rbs CHANGED
@@ -26,9 +26,13 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: (?numa: bool) -> void
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -67,6 +71,20 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
@@ -80,6 +98,7 @@ module LLaMACpp
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
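A short sketch of reading the new per-context counters after a run, using the accessors declared above (an already-used `context` is assumed):

    timings = context.timings
    puts format('load: %.2f ms, sample: %.2f ms over %d samples, eval: %.2f ms over %d tokens',
                timings.t_load_ms, timings.t_sample_ms, timings.n_sample,
                timings.t_eval_ms, timings.n_eval)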
@@ -90,6 +109,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
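A hedged sketch of the new guidance sampler: `candidates` is assumed to be a LLaMACpp::TokenDataArray built from the main context's unsorted logits, and `guidance_ctx` a second Context on the same model that was fed the negative prompt (construction of both is omitted):

    context.sample_classifier_free_guidance(candidates, guidance: guidance_ctx,
                                            scale: 1.5, smooth_factor: 0.1)
    context.sample_softmax(candidates) # then pick a token as usual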
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-02 00:00:00.000000000 Z
+date: 2023-07-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -25,6 +25,7 @@ files:
 - examples/README.md
 - examples/chat.rb
 - examples/embedding.rb
+- examples/prompt_jp.txt
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
@@ -34,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c