llama_cpp 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,10 +19,16 @@
 #    define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC 'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
 #ifdef __cplusplus
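
The header replaces the old multi-character literals ('ggjt', 'ggml', ...), whose values are implementation-defined in C and C++, with named, explicitly unsigned hex constants, one per known container format, and bumps LLAMA_FILE_VERSION from 1 to 3. A minimal sketch of how a loader might branch on the new constants; the helper and command-line usage are hypothetical, and a little-endian host is assumed, matching how llama.cpp writes these headers:

```c
#include <stdint.h>
#include <stdio.h>

#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'

// Hypothetical helper: identify a model file by its leading magic word.
static void identify_model(const char *path) {
    FILE *f = fopen(path, "rb");
    if (f == NULL) { perror(path); return; }
    uint32_t magic = 0;
    if (fread(&magic, sizeof(magic), 1, f) == 1) {
        if (magic == LLAMA_FILE_MAGIC_GGJT) {
            uint32_t version = 0; // versioned formats carry a version word next
            if (fread(&version, sizeof(version), 1, f) == 1)
                printf("%s: ggjt v%u\n", path, (unsigned)version);
        } else if (magic == LLAMA_FILE_MAGIC_GGML) {
            printf("%s: unversioned ggml file\n", path); // no version word follows
        } else {
            printf("%s: unknown magic 0x%08x\n", path, (unsigned)magic);
        }
    }
    fclose(f);
}

int main(int argc, char **argv) {
    if (argc > 1) identify_model(argv[1]);
    return 0;
}
```
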
@@ -40,9 +46,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
-        llama_token id; // token id
-        float logit; // log-odds of the token
-        float p; // probability of the token
+        llama_token id;    // token id
+        float logit;       // log-odds of the token
+        float p;           // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
@@ -54,9 +60,9 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx; // text context
-        int n_parts; // -1 for default
-        int seed; // RNG seed, -1 for random
+        int n_ctx;        // text context
+        int n_gpu_layers; // number of layers to store in VRAM
+        int seed;         // RNG seed, -1 for random
 
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
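
The n_parts field is gone (llama.cpp no longer loads multi-part model files) and n_gpu_layers takes its slot, enabling partial GPU offload. A sketch of requesting offload at context creation; llama_init_from_file and llama_free come from the same-era header rather than this hunk, and the model path and layer count are placeholders:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params params = llama_context_default_params();
    params.n_ctx        = 512; // text context, as before
    params.n_gpu_layers = 32;  // new: layers to keep in VRAM (0 = pure CPU)
    params.seed         = -1;  // RNG seed, -1 for random

    // Placeholder path; NULL is returned on failure.
    struct llama_context *ctx = llama_init_from_file("model.ggjt", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }
    llama_free(ctx);
    return 0;
}
```
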
@@ -73,16 +79,16 @@ extern "C" {
 
     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32 = 0,
-        LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
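
Q4_2 joins Q4_3 as a removed quantization format; the remaining enumerators keep their numeric values, so existing files stay valid. For context, a sketch of quantizing to a still-supported type. llama_model_quantize is not part of this hunk, so its signature here is an assumption from the same-era header, and both file names are placeholders:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    // Assumed same-era signature, returning 0 on success:
    //   int llama_model_quantize(const char * fname_inp, const char * fname_out,
    //                            enum llama_ftype ftype, int nthread);
    if (llama_model_quantize("model-f16.ggjt", "model-q5_1.ggjt",
                             LLAMA_FTYPE_MOSTLY_Q5_1, /*nthread=*/4) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```
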
@@ -90,6 +96,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
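
Two new entry points: llama_init_backend() performs one-time global initialization (the gem now calls its binding at require time, see lib/llama_cpp.rb below) and llama_time_us() exposes a microsecond timer. A sketch combining the two to time a model load; the path is a placeholder, and llama_init_from_file/llama_free are from the same-era header:

```c
#include <stdio.h>
#include "llama.h"

int main(void) {
    llama_init_backend(); // new: call once, before any other llama_* function

    const int64_t t_start = llama_time_us(); // new: microsecond timestamps
    struct llama_context_params params = llama_context_default_params();
    struct llama_context *ctx = llama_init_from_file("model.ggjt", params); // placeholder
    const int64_t t_load_us = llama_time_us() - t_start;

    printf("model load: %.2f ms\n", t_load_us / 1000.0);
    if (ctx != NULL) llama_free(ctx);
    return 0;
}
```
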
@@ -134,11 +147,11 @@ extern "C" {
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
     // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
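
Besides the dest to dst rename, llama_set_state_data drops the const from src, so callers holding a const buffer now need a cast or a mutable copy. A round-trip sketch; llama_get_state_size is assumed from the same-era header (it is not part of this hunk):

```c
#include <stdlib.h>
#include "llama.h"

// Sketch: snapshot a context's state into a heap buffer, then roll back.
static uint8_t *snapshot_state(struct llama_context *ctx) {
    const size_t n = llama_get_state_size(ctx); // assumed: upper bound on state size
    uint8_t *buf = malloc(n);
    if (buf != NULL)
        llama_copy_state_data(ctx, buf); // parameter is now named dst
    return buf;
}

static void rollback_state(struct llama_context *ctx, uint8_t *buf) {
    llama_set_state_data(ctx, buf); // src is non-const as of this version
}
```
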
@@ -202,16 +215,16 @@ extern "C" {
     LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
     LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
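
The min_keep = 1 default arguments are removed from the four truncation samplers. Default arguments are a C++-only feature, so this also makes the declarations valid C; every caller now passes min_keep explicitly. A sketch of a typical chain under the new signatures; the tuning values are illustrative, and llama_sample_token is assumed from the same-era header:

```c
#include "llama.h"

// Sketch: classic truncation chain; candidates must already hold the
// logits for the full vocabulary.
static llama_token pick_token(struct llama_context *ctx,
                              llama_token_data_array *candidates) {
    const size_t min_keep = 1; // formerly the C++ default, now explicit
    llama_sample_top_k(ctx, candidates, /*k=*/40, min_keep);
    llama_sample_tail_free(ctx, candidates, /*z=*/1.0f, min_keep);
    llama_sample_typical(ctx, candidates, /*p=*/1.0f, min_keep);
    llama_sample_top_p(ctx, candidates, /*p=*/0.95f, min_keep);
    llama_sample_temperature(ctx, candidates, /*temp=*/0.8f);
    return llama_sample_token(ctx, candidates); // assumed same-era helper
}
```
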
@@ -9,7 +9,6 @@ module LLaMACpp
     # @param lora_adapter_path [String] The path to the LoRA adapter file.
     # @param lora_base_path [String] The path to the LoRA base model file.
     # @param n_ctx [Integer] The context size.
-    # @param n_parts [Integer] The amount of model parts (-1 = determine from model dimensions).
     # @param memory_f16 [Boolean] The flag wheter to use f16 instead of f32 for memory kv.
     # @param use_mmap [Boolean] The flag whether to use mmap.
     # @param use_mlock [Boolean] The flag hether to use mlock.
@@ -19,7 +18,7 @@ module LLaMACpp
     # @return [Client]
     # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
     def initialize(model_path:, lora_adapter_path: nil, lora_base_path: nil,
-                   n_ctx: 512, n_parts: -1, memory_f16: false, use_mmap: true, use_mlock: false,
+                   n_ctx: 512, memory_f16: false, use_mmap: true, use_mlock: false,
                    embedding: false,
                    n_threads: 1, seed: 0)
       @params = {
@@ -27,7 +26,6 @@ module LLaMACpp
         lora_adapter_path: lora_adapter_path,
         lora_base_path: lora_base_path,
         n_ctx: n_ctx,
-        n_parts: n_parts,
         memory_f16: memory_f16,
         use_mmap: use_mmap,
         use_mlock: use_mlock,
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.1.0'
+  VERSION = '0.1.2'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-173d0e6'
+  LLAMA_CPP_VERSION = 'master-265db98'
 end
data/lib/llama_cpp.rb CHANGED
@@ -106,3 +106,5 @@ module LLaMACpp
     output.join.delete_prefix(spaced_prompt).strip
   end
 end
+
+LLaMACpp.init_backend
data/sig/llama_cpp.rbs CHANGED
@@ -10,11 +10,11 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1: Integer
   LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
-  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
   LLAMA_FTYPE_MOSTLY_Q8_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_0: Integer
   LLAMA_FTYPE_MOSTLY_Q5_1: Integer
 
+  def self?.init_backend: () -> void
   def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_predict: Integer, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
@@ -65,6 +65,8 @@ module LLaMACpp
     def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
     def kv_cache_token_count: () -> Integer
     def set_rng_seed: (Integer) -> void
+    def load_session_file: (session_path: String) -> void
+    def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
     def sample_repetition_penalty: (::LLaMACpp::TokenDataArray, Array[Integer], penalty: Float) -> void
     def sample_frequency_and_presence_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], frequency: Float, presence: Float) -> void
     def sample_softmax: (::LLaMACpp::TokenDataArray) -> void
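
Context#load_session_file and Context#save_session_file are new; they presumably wrap llama_load_session_file and llama_save_session_file from the header (only the former appears in this diff). A C-level sketch of the restore path; the buffer capacity and return convention are illustrative:

```c
#include "llama.h"

// Sketch: restore a saved session so the prompt's KV cache does not
// have to be recomputed. Returns the number of tokens restored, or 0.
static size_t restore_session(struct llama_context *ctx, const char *path) {
    llama_token tokens[512]; // illustrative capacity
    size_t n_tokens = 0;
    if (!llama_load_session_file(ctx, path, tokens,
                                 sizeof(tokens) / sizeof(tokens[0]), &n_tokens))
        return 0; // missing file or incompatible model/context
    return n_tokens;
}
```
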
@@ -90,8 +92,6 @@ module LLaMACpp
     def logits_all=: (bool) -> bool
     def n_ctx: () -> Integer
     def n_ctx=: (Integer) -> Integer
-    def n_parts: () -> Integer
-    def n_parts=: (Integer) -> Integer
     def seed: () -> Integer
     def seed=: (Integer) -> Integer
     def use_mlock: () -> bool
@@ -106,7 +106,7 @@ module LLaMACpp
 
   class Client
     def initialize(model_path: String, ?lora_adapter_path: String, ?lora_base_path: String,
-                   ?n_ctx: Integer, ?n_parts: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
+                   ?n_ctx: Integer, ?memory_f16: bool, ?use_mmap: bool, ?use_mlock: bool,
                    ?embedding: bool, ?n_threads: Integer, ?seed: Integer) -> void
     def completions(String, ?max_tokens: Integer, ?n_keep: Integer, ?repeat_last_n: Integer, ?n_batch: Integer,
                     ?frequency: Float, ?presence: Float,
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-05-20 00:00:00.000000000 Z
+date: 2023-05-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: