llama_cpp 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,6 +134,20 @@ extern "C" {
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
 
+    // performance timing information
+    struct llama_timings {
+        double t_start_ms;
+        double t_end_ms;
+        double t_load_ms;
+        double t_sample_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_sample;
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
 
@@ -144,7 +158,9 @@ extern "C" {
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend(bool numa);
+    LLAMA_API void llama_backend_init(bool numa);
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free();
 
     LLAMA_API int64_t llama_time_us();
 
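The renamed backend hooks are exposed to Ruby as LLaMACpp.backend_init and LLaMACpp.backend_free (see the RBS changes further down). A minimal lifecycle sketch, assuming only that the gem is installed:

  require 'llama_cpp'

  # Call once at program start; numa: true enables NUMA optimizations.
  LLaMACpp.backend_init(numa: false)

  begin
    # ... create contexts and run inference here ...
  ensure
    # New in this release: call once at program exit (currently only tears down MPI state).
    LLaMACpp.backend_free
  end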
@@ -293,6 +309,18 @@ extern "C" {
     /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
     LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
 
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+    LLAMA_API void llama_sample_classifier_free_guidance(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        struct llama_context * guidance_ctx,
+        float scale,
+        float smooth_factor);
+
     /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
@@ -331,6 +359,7 @@ extern "C" {
     LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
     LLAMA_API void llama_reset_timings(struct llama_context * ctx);
 
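The new llama_get_timings() call is mirrored on the Ruby side by LLaMACpp::Context#timings, which returns the LLaMACpp::Timings object declared in the RBS changes below. A rough sketch of reading it, assuming `context` is an already initialized LLaMACpp::Context that has evaluated some tokens:

  timings = context.timings  # => LLaMACpp::Timings

  # Millisecond totals per phase, with the corresponding token counts.
  printf("load:        %8.2f ms\n", timings.t_load_ms)
  printf("sample:      %8.2f ms / %d tokens\n", timings.t_sample_ms, timings.n_sample)
  printf("prompt eval: %8.2f ms / %d tokens\n", timings.t_p_eval_ms, timings.n_p_eval)
  printf("eval:        %8.2f ms / %d tokens\n", timings.t_eval_ms, timings.n_eval)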
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.3.1'
+  VERSION = '0.3.3'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-b8c8dda'
+  LLAMA_CPP_VERSION = 'master-32c5411'
 end
data/lib/llama_cpp.rb CHANGED
@@ -16,8 +16,22 @@ module LLaMACpp
   # @param prompt [String] The prompt to start generation with.
   # @param n_predict [Integer] The number of tokens to predict.
   # @param n_threads [Integer] The number of threads.
+  # @param n_keep [Integer] The number of tokens to keep in the context.
+  # @param n_batch [Integer] The number of tokens to process in a batch.
+  # @param repeat_last_n [Integer] The number of tokens to consider for repetition penalty.
+  # @param repeat_penalty [Float] The repetition penalty.
+  # @param frequency [Float] The frequency penalty.
+  # @param presence [Float] The presence penalty.
+  # @param top_k [Integer] The number of tokens to consider for top-k sampling.
+  # @param top_p [Float] The probability threshold for nucleus sampling.
+  # @param tfs_z [Float] The z parameter for tail-free sampling.
+  # @param typical_p [Float] The probability for typical sampling.
+  # @param temperature [Float] The temperature for temperature sampling.
   # @return [String]
-  def generate(context, prompt, n_predict: 128, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+  def generate(context, prompt, # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/ParameterLists, Metrics/PerceivedComplexity
+               n_predict: 128, n_threads: 1, n_keep: 10, n_batch: 512, repeat_last_n: 64,
+               repeat_penalty: 1.1, frequency: 0.0, presence: 0.0, top_k: 40,
+               top_p: 0.95, tfs_z: 1.0, typical_p: 1.0, temperature: 0.8)
     raise ArgumentError, 'context must be an instance of LLaMACpp::Context' unless context.is_a?(LLaMACpp::Context)
     raise ArgumentError, 'prompt must be a String' unless prompt.is_a?(String)
 
@@ -31,19 +45,8 @@ module LLaMACpp
 
     embd = []
     n_consumed = 0
-    n_keep = 10
     n_past = 0
     n_remain = n_predict
-    repeat_last_n = 64
-    repeat_penalty = 1.1
-    frequency = 0.0
-    presence = 0.0
-    top_k = 40
-    top_p = 0.95
-    tfs_z = 1.0
-    typical_p = 1.0
-    temperature = 0.8
-    n_batch = 512
     n_vocab = context.n_vocab
     output = []
 
@@ -105,4 +108,4 @@ module LLaMACpp
   end
 end
 
-LLaMACpp.init_backend
+LLaMACpp.backend_init
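Since the sampling settings are now keyword arguments rather than hard-coded locals, callers can tune them per invocation; omitted keywords fall back to the defaults shown above. A sketch, assuming `context` is an initialized LLaMACpp::Context (the prompt string is purely illustrative):

  output = LLaMACpp.generate(
    context, 'Building a website can be done in 10 simple steps:',
    n_predict: 128, n_threads: 4,
    top_k: 40, top_p: 0.95, temperature: 0.8,
    repeat_last_n: 64, repeat_penalty: 1.1
  )
  puts output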
data/sig/llama_cpp.rbs CHANGED
@@ -26,9 +26,13 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q5_K_M: Integer
   LLAMA_FTYPE_MOSTLY_Q6_K: Integer
 
-  def self?.init_backend: (?numa: bool) -> void
+  def self?.backend_init: (?numa: bool) -> void
+  def self?.backend_free: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, params: ModelQuantizeParams) -> void
-  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
+  def self?.generate: (::LLaMACpp::Context, String,
+                       ?n_predict: Integer, ?n_threads: Integer, ?n_keep: Integer, ?n_batch: Integer,
+                       ?repeat_last_n: Integer, ?repeat_penalty: Float, ?frequency: Float, ?presence: Float,
+                       ?top_k: Integer, ?top_p: Float, ?tfs_z: Float, ?typical_p: Float, ?temperature: Float) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
@@ -67,6 +71,20 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
+  class Timings
+    public
+
+    def t_start_ms: () -> Float
+    def t_end_ms: () -> Float
+    def t_load_ms: () -> Float
+    def t_sample_ms: () -> Float
+    def t_p_eval_ms: () -> Float
+    def t_eval_ms: () -> Float
+    def n_sample: () -> Integer
+    def n_p_eval: () -> Integer
+    def n_eval: () -> Integer
+  end
+
   class Context
     public
 
@@ -80,6 +98,7 @@ module LLaMACpp
     def n_embd: () -> Integer
     def n_vocab: () -> Integer
     def vocab: (capacity: Integer) -> [Array[String], Array[Float]]
+    def timings: () -> ::LLaMACpp::Timings
     def print_timings: () -> void
     def reset_timings: () -> void
     def token_to_str: (Integer) -> String
@@ -90,6 +109,7 @@ module LLaMACpp
     def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
+    def sample_classifier_free_guidance: (::LLaMACpp::TokenDataArray, guidance: ::LLaMACpp::Context, scale: Float, smooth_factor: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
     def sample_top_k: (::LLaMACpp::TokenDataArray, k: Integer, ?min_keep: Integer) -> void
     def sample_top_p: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
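The classifier-free guidance sampler added in llama.h is bound as Context#sample_classifier_free_guidance with the signature above. A hedged sketch of one sampling step, assuming `context` and `guidance_context` are two contexts over the same model, the negative prompt has already been evaluated in `guidance_context`, and the candidate array is built from unsorted logits following the pattern used inside LLaMACpp.generate:

  # Candidate tokens must come from the main context's raw (unsorted) logits.
  logits = context.logits
  candidates = LLaMACpp::TokenDataArray.new(
    Array.new(context.n_vocab) { |i| LLaMACpp::TokenData.new(id: i, logit: logits[i], p: 0.0) }
  )

  # Blend the guidance context's logits into the candidates.
  # scale: 1.0 means no guidance; smooth_factor: 1.0 uses only the guidance logits.
  context.sample_classifier_free_guidance(candidates, guidance: guidance_context,
                                          scale: 1.5, smooth_factor: 0.5)

  # ...then apply the usual samplers (e.g. sample_top_k / sample_top_p) to candidates.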
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.3.3
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-07-02 00:00:00.000000000 Z
+date: 2023-07-15 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -25,6 +25,7 @@ files:
 - examples/README.md
 - examples/chat.rb
 - examples/embedding.rb
+- examples/prompt_jp.txt
 - ext/llama_cpp/extconf.rb
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
@@ -34,6 +35,8 @@ files:
 - ext/llama_cpp/src/ggml-metal.h
 - ext/llama_cpp/src/ggml-metal.m
 - ext/llama_cpp/src/ggml-metal.metal
+- ext/llama_cpp/src/ggml-mpi.c
+- ext/llama_cpp/src/ggml-mpi.h
 - ext/llama_cpp/src/ggml-opencl.cpp
 - ext/llama_cpp/src/ggml-opencl.h
 - ext/llama_cpp/src/ggml.c