llama_cpp 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -72,6 +72,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2          = 5, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3          = 6, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,10 +93,24 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            enum llama_ftype ftype);
+            enum llama_ftype ftype,
+            int nthread);
+
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
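The new llama_apply_lora_from_file entry point surfaces in the gem as Context#apply_lora_from_file, whose Ruby-side signature is added to the sig file further down in this diff. A minimal usage sketch, assuming the model and adapter paths below are placeholders for files you have on disk:

    require 'llama_cpp'

    context = LLaMACpp::Context.new(model_path: 'ggml-model-q4_0.bin',
                                    params: LLaMACpp::ContextParams.new)

    # Patch the loaded model with a LoRA adapter. base_model_path is optional;
    # when given, the layers modified by the adapter are taken from that
    # higher-quality base model rather than from the quantized model already loaded.
    context.apply_lora_from_file(lora_path: 'ggml-lora-adapter.bin',
                                 base_model_path: 'ggml-model-f16.bin',
                                 n_threads: 4)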
@@ -43,8 +43,12 @@
 } while (0)
 
 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}
 
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
@@ -164,7 +168,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -172,15 +176,16 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }
 
-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
         }
     }
 
@@ -190,14 +195,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;
 
-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
 
         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
 
         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);
 
         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -212,13 +216,15 @@ struct llama_mmap {
         }
 
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.0.4'
+  VERSION = '0.0.6'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'master-c85e03d'
+  LLAMA_CPP_VERSION = 'master-12b5900'
 end
data/lib/llama_cpp.rb CHANGED
@@ -17,9 +17,9 @@ module LLaMACpp
   # @param n_threads [Integer]
   # @return [String]
   def generate(context, prompt, n_threads: 1) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/PerceivedComplexity
-    prompt.insert(0, ' ')
+    spaced_prompt = " #{prompt}"
 
-    embd_input = context.tokenize(text: prompt, add_bos: true)
+    embd_input = context.tokenize(text: spaced_prompt, add_bos: true)
 
     n_ctx = context.n_ctx
     last_n_tokens = [0] * n_ctx
@@ -71,6 +71,6 @@ module LLaMACpp
       break if embd[-1] == LLaMACpp.token_eos
     end
 
-    output.join.delete_prefix(prompt).strip
+    output.join.delete_prefix(spaced_prompt).strip
   end
 end
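A caller-visible effect of this change: generate no longer mutates the prompt argument in place (the old prompt.insert(0, ' ') modified the caller's string), so the same string can be reused after the call. A short sketch with a hypothetical prompt, assuming context was created as in the earlier example:

    prompt = 'Hello, my name is'
    output = LLaMACpp.generate(context, prompt, n_threads: 4)

    # With 0.0.6 the argument is left untouched; under 0.0.4 it would have
    # become " Hello, my name is" after the call.
    puts prompt  # => "Hello, my name is"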
data/sig/llama_cpp.rbs CHANGED
@@ -5,10 +5,21 @@ module LLaMACpp
   LLAMA_FILE_MAGIC: String
   LLAMA_FILE_MAGIC_UNVERSIONED: String
 
+  LLAMA_FTYPE_ALL_F32: Integer
+  LLAMA_FTYPE_MOSTLY_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_2: Integer
+  LLAMA_FTYPE_MOSTLY_Q4_3: Integer
+
+  def self?.model_quantize: (input_path: String, output_path: String, ftype: Integer, ?n_threads: Integer) -> void
   def self?.generate: (::LLaMACpp::Context, String, ?n_threads: Integer) -> String
   def self?.print_system_info: () -> void
   def self?.token_bos: () -> Integer
   def self?.token_eos: () -> Integer
+  def self?.mmap_supported?: () -> bool
+  def self?.mlock_supported?: () -> bool
 
   class Context
     public
@@ -16,7 +27,7 @@ module LLaMACpp
     def initialize: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
                   | () -> void
     def embeddings: () -> Array[Float]
-    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> Qnil
+    def eval: (tokens: Array[Integer], n_past: Integer, ?n_tokens: Integer, ?n_threads: Integer) -> void
     def free: () -> void
     def load: (model_path: String, params: ::LLaMACpp::ContextParams) -> void
     def logits: () -> Array[Float]
@@ -28,6 +39,7 @@ module LLaMACpp
     def sample_top_p_top_k: (top_k: Integer, top_p: Float, temp: Float, penalty: Float) -> Integer
     def token_to_str: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool) -> Array[Integer]
+    def apply_lora_from_file: (lora_path: String, ?base_model_path: String, ?n_threads: Integer) -> void
   end
 
   class ContextParams
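These signatures pin down the Ruby-side surface for the new quantization formats and memory-mapping probes. A rough sketch of how they might be used together, with placeholder file names:

    require 'llama_cpp'

    # Feature probes added in this release.
    puts "mmap:  #{LLaMACpp.mmap_supported?}"
    puts "mlock: #{LLaMACpp.mlock_supported?}"

    # Re-quantize an f16 model to the new Q4_2 format using 8 threads.
    LLaMACpp.model_quantize(input_path: 'ggml-model-f16.bin',
                            output_path: 'ggml-model-q4_2.bin',
                            ftype: LLaMACpp::LLAMA_FTYPE_MOSTLY_Q4_2,
                            n_threads: 8)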
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-04-15 00:00:00.000000000 Z
+date: 2023-04-22 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email:
@@ -26,6 +26,7 @@ files:
 - ext/llama_cpp/llama_cpp.cpp
 - ext/llama_cpp/llama_cpp.h
 - ext/llama_cpp/src/LICENSE
+- ext/llama_cpp/src/ggml-cuda.h
 - ext/llama_cpp/src/ggml.c
 - ext/llama_cpp/src/ggml.h
 - ext/llama_cpp/src/llama.cpp