llama_cpp 0.0.4 → 0.0.6

@@ -72,6 +72,8 @@ extern "C" {
  LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+ LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
  };
 
  LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,24 @@ extern "C" {
 
  // TODO: not great API - very likely to change
  // Returns 0 on success
+ // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
  LLAMA_API int llama_model_quantize(
  const char * fname_inp,
  const char * fname_out,
- enum llama_ftype ftype);
+ enum llama_ftype ftype,
+ int nthread);
+
+ // Apply a LoRA adapter to a loaded model
+ // path_base_model is the path to a higher quality model to use as a base for
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+ // will be applied on top of the previous one
+ // Returns 0 on success
+ LLAMA_API int llama_apply_lora_from_file(
+ struct llama_context * ctx,
+ const char * path_lora,
+ const char * path_base_model,
+ int n_threads);
 
  // Returns the KV cache that will contain the context for the
  // ongoing prediction with the model.
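
At the Ruby level, these additions surface as LLaMACpp.model_quantize (with an optional n_threads keyword) and Context#apply_lora_from_file; their signatures appear in the RBS changes further down. A minimal sketch of the new quantize call follows; the model paths are placeholders, and the keyword names come from the RBS signature added in this release:

    require 'llama_cpp'

    # Quantize a full-precision GGML model to the new Q4_2 format, using 4 worker
    # threads (paths below are placeholders, not files shipped with the gem).
    LLaMACpp.model_quantize(
      input_path: 'models/7B/ggml-model-f16.bin',
      output_path: 'models/7B/ggml-model-q4_2.bin',
      ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2,
      n_threads: 4
    )
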
@@ -43,8 +43,12 @@
  } while (0)
 
  #ifdef __GNUC__
+ #ifdef __MINGW32__
+ __attribute__((format(gnu_printf, 1, 2)))
+ #else
  __attribute__((format(printf, 1, 2)))
  #endif
+ #endif
  static std::string format(const char * fmt, ...) {
  va_list ap, ap2;
  va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
  va_end(ap2);
  va_end(ap);
  return std::string(buf.data(), size);
- };
+ }
 
  struct llama_file {
  // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
  static constexpr bool SUPPORTED = true;
 
- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;
  int fd = fileno(file->fp);
  int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
  flags |= MAP_POPULATE;
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
- close(fd);
  if (addr == MAP_FAILED) {
  throw format("mmap failed: %s", strerror(errno));
  }
 
- // Advise the kernel to preload the mapped memory
- if (madvise(addr, file->size, MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
- strerror(errno));
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ if (madvise(addr, file->size, MADV_WILLNEED)) {
+ fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ strerror(errno));
+ }
  }
  }
 
@@ -190,14 +195,13 @@ struct llama_mmap {
  #elif defined(_WIN32)
  static constexpr bool SUPPORTED = true;
 
- llama_mmap(struct llama_file * file) {
+ llama_mmap(struct llama_file * file, bool prefetch = true) {
  size = file->size;
 
  HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
  HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  DWORD error = GetLastError();
- CloseHandle(hFile);
 
  if (hMapping == NULL) {
  throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
  }
 
  #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
+ if (prefetch) {
+ // Advise the kernel to preload the mapped memory
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T)size;
+ if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
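
The mmap constructor now takes a prefetch flag so callers can skip the MADV_WILLNEED / PrefetchVirtualMemory hint. On the Ruby side, this release also exposes platform probes (declared in the RBS changes below); a short sketch of how they might be used:

    require 'llama_cpp'

    # Report whether the bundled llama.cpp build supports mmap-based model
    # loading and mlock on this platform.
    puts "mmap supported?:  #{LLaMACpp.mmap_supported?}"
    puts "mlock supported?: #{LLaMACpp.mlock_supported?}"
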
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.0.4'
+ VERSION = '0.0.6'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'master-c85e03d'
+ LLAMA_CPP_VERSION = 'master-12b5900'
  end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
  # @param n_threads [Integer]
  # @return [String]
  def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
- prompt.insert(0, ' ')
+ spaced_prompt = " #{prompt}"
 
- embd_input = context.tokenize(text: prompt, add_bos: true)
+ embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
  n_ctx = context.n_ctx
  last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
  break if embd[-1] == LLaMACpp.token_eos
  end
 
- output.join.delete_prefix(prompt).strip
+ output.join.delete_prefix(spaced_prompt).strip
  end
  end
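
This change stops generate from mutating the caller's prompt: the leading space is now prepended to a copy, so the argument string is left untouched. A hedged sketch of the observable difference, assuming a default-constructed ContextParams and a placeholder model path:

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)

    prompt = 'Hello, my name is'
    output = LLaMACpp.generate(context, prompt, n_threads: 4)

    puts output
    puts prompt # still "Hello, my name is"; no space is inserted in place anymore
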
data/sig/llama_cpp.rbs CHANGED
@@ -5,10 +5,21 @@ module LLaMACpp
  LLAMA_FILE_MAGIC: String
  LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+ LLAMA_FTYPE_ALL_F32: Integer
+ LLAMA_FTYPE_MOSTLY_F16: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+ LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+ def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
  def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
  def self?.print_system_info: () -> void
  def self?.token_bos: () -> Integer
  def self?.token_eos: () -> Integer
+ def self?.mmap_supported?: () -> bool
+ def self?.mlock_supported?: () -> bool
 
  class Context
  public
@@ -16,7 +27,7 @@ module LLaMACpp
  def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  | () -> void
  def embeddings: () -> Array[Float]
- def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+ def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
  def free: () -> void
  def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
  def logits: () -> Array[Float]
@@ -28,6 +39,7 @@ module LLaMACpp
  def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
  def token_to_str: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+ def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
  end
 
  class ContextParams
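
The new Context#apply_lora_from_file binding mirrors llama_apply_lora_from_file from the C header above; per that header's comment, the model has to be reloaded before a different adapter is applied. A hedged sketch using the RBS signature (all file paths are placeholders):

    require 'llama_cpp'

    params = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model_path: 'models/7B/ggml-model-q4_0.bin', params: params)

    # Apply a LoRA adapter to the loaded model. base_model_path may point at a
    # higher-quality (e.g. f16) model used for the layers the adapter modifies;
    # omit it to patch the currently loaded weights instead.
    context.apply_lora_from_file(
      lora_path: 'loras/ggml-adapter-model.bin',
      base_model_path: 'models/7B/ggml-model-f16.bin',
      n_threads: 4
    )
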
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: llama_cpp
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: 0.0.6
  platform: ruby
  authors:
  - yoshoku
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2023-04-15 00:00:00.000000000 Z
+ date: 2023-04-22 00:00:00.000000000 Z
  dependencies: []
  description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
  email:
@@ -26,6 +26,7 @@ files:
  - ext/llama_cpp/llama_cpp.cpp
  - ext/llama_cpp/llama_cpp.h
  - ext/llama_cpp/src/LICENSE
+ - ext/llama_cpp/src/ggml-cuda.h
  - ext/llama_cpp/src/ggml.c
  - ext/llama_cpp/src/ggml.h
  - ext/llama_cpp/src/llama.cpp