llama_cpp 0.1.0 → 0.1.2

@@ -19,10 +19,16 @@
  # define LLAMA_API
  #endif

- #define LLAMA_FILE_VERSION 1
- #define LLAMA_FILE_MAGIC 'ggjt'
- #define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
- #define LLAMA_SESSION_MAGIC 'ggsn'
+ #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
+ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
+ #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
+ #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
+ #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
+
+ #define LLAMA_FILE_VERSION 3
+ #define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
+ #define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN

  #define LLAMA_SESSION_VERSION 1

  #ifdef __cplusplus
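The new magic constants are simply the 32-bit big-endian ASCII encodings of the old multi-character literals. A quick Ruby check (illustrative only, not part of the gem):

  # Decode the new magic constants back to their ASCII tags (32-bit big-endian).
  [0x67676a74, 0x67676c61, 0x67676d66, 0x67676d6c, 0x6767736e].map { |m| [m].pack('N') }
  # => ["ggjt", "ggla", "ggmf", "ggml", "ggsn"]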
@@ -40,9 +46,9 @@ extern "C" {
  typedef int llama_token;

  typedef struct llama_token_data {
- llama_token id; // token id
- float logit; // log-odds of the token
- float p; // probability of the token
+ llama_token id; // token id
+ float logit; // log-odds of the token
+ float p; // probability of the token
  } llama_token_data;

  typedef struct llama_token_data_array {
@@ -54,9 +60,9 @@ extern "C" {
  typedef void (*llama_progress_callback)(float progress, void *ctx);

  struct llama_context_params {
- int n_ctx; // text context
- int n_parts; // -1 for default
- int seed; // RNG seed, -1 for random
+ int n_ctx; // text context
+ int n_gpu_layers; // number of layers to store in VRAM
+ int seed; // RNG seed, -1 for random

  bool f16_kv; // use fp16 for KV cache
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,16 +79,16 @@ extern "C" {

  // model file types
  enum llama_ftype {
- LLAMA_FTYPE_ALL_F32 = 0,
- LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ LLAMA_FTYPE_ALL_F32 = 0,
+ LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
- // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
- LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+ // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+ LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
  };

  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +96,13 @@ extern "C" {
  LLAMA_API bool llama_mmap_supported();
  LLAMA_API bool llama_mlock_supported();

+ // TODO: not great API - very likely to change
+ // Initialize the llama + ggml backend
+ // Call once at the start of the program
+ LLAMA_API void llama_init_backend();
+
+ LLAMA_API int64_t llama_time_us();
+
  // Various functions for loading a ggml llama model.
  // Allocate (almost) all memory needed for the model.
  // Return NULL on failure
@@ -134,11 +147,11 @@ extern "C" {
  // Copies the state to the specified destination address.
  // Destination needs to have allocated enough memory.
  // Returns the number of bytes copied
- LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+ LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);

  // Set the state reading from the specified address
  // Returns the number of bytes read
- LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+ LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

  // Save/load session file
  LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
@@ -202,16 +215,16 @@ extern "C" {
  LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+ LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);

  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
- LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);

  /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
- LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+ LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);

  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
- LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+ LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
  LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@@ -9,7 +9,6 @@ module LLaMACpp
  # @param lora_adapter_path [String] The path to the LoRA adapter file.
  # @param lora_base_path [String] The path to the LoRA base model file.
  # @param n_ctx [Integer] The context size.
- # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
  # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
  # @param use_mmap [Boolean] The flag whether to use mmap.
  # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
  # @return [Client]
  # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
  def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
- n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+ n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
  embedding: false,
  n_threads: 1, seed: 0)
  @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
  lora_adapter_path: lora_adapter_path,
  lora_base_path: lora_base_path,
  n_ctx: n_ctx,
- n_parts: n_parts,
  memory_f16: memory_f16,
  use_mmap: use_mmap,
  use_mlock: use_mlock,
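With the n_parts option removed from Client#initialize, construction uses only the remaining keyword arguments. A minimal sketch (the model path, seed, and prompt are placeholders):

  require 'llama_cpp'

  # n_parts: is no longer accepted; the other options keep their defaults.
  client = LLaMACpp::Client.new(model_path: '/path/to/ggml-model.bin', n_ctx: 512, seed: 42)
  puts client.completions('Hello, my name is', max_tokens: 32)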
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.1.0'
+ VERSION = '0.1.2'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-173d0e6'
+ LLAMA_CPP_VERSION = 'master-265db98'
  end
data/lib/llama_cpp.rb CHANGED
@@ -106,3 +106,5 @@ module LLaMACpp
  output.join.delete_prefix(spaced_prompt).strip
  end
  end
+
+ LLaMACpp.init_backend
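Since lib/llama_cpp.rb now invokes LLaMACpp.init_backend when it is loaded, loading the gem is enough to initialize the llama + ggml backend; the method is also exposed directly (see the .rbs change below). A minimal sketch:

  # Requiring the gem now initializes the backend automatically;
  # no explicit LLaMACpp.init_backend call is needed in user code.
  require 'llama_cpp'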
data/sig/llama_cpp.rbs CHANGED
@@ -10,11 +10,11 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
- LLAMA_FTYPE_MOSTLY_Q4_2: Integer
  LLAMA_FTYPE_MOSTLY_Q8_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_0: Integer
  LLAMA_FTYPE_MOSTLY_Q5_1: Integer

+ def self?.init_backend: () -> void
  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
@@ -65,6 +65,8 @@ module LLaMACpp
  def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  def kv_cache_token_count: () -> Integer
  def set_rng_seed: (Integer) -> void
+ def load_session_file: (session_path: String) -> void
+ def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
  def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
  def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
  def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
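Per these new signatures, session state can be saved to and restored from a file on a context. A rough sketch, assuming `context` is an already-initialized LLaMACpp::Context and `tokens` is the Array of Integer token ids already evaluated (both placeholders):

  # Persist the evaluated state so a later run can resume without re-evaluating the prompt.
  context.save_session_file(session_path: 'prompt.session', session_tokens: tokens)

  # Later, on a context created for the same model:
  context.load_session_file(session_path: 'prompt.session')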
@@ -90,8 +92,6 @@ module LLaMACpp
  def logits_all=: (bool) -> bool
  def n_ctx: () -> Integer
  def n_ctx=: (Integer) -> Integer
- def n_parts: () -> Integer
- def n_parts=: (Integer) -> Integer
  def seed: () -> Integer
  def seed=: (Integer) -> Integer
  def use_mlock: () -> bool
@@ -106,7 +106,7 @@ module LLaMACpp

  class Client
  def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
- ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+ ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
  ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
  def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
  ?frequency: Float, ?presence: Float,
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.1.0
+ version: 0.1.2
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-05-20 00:00:00.000000000 Z
+ date: 2023-05-22 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email: