llama_cpp 0.5.0 → 0.5.2

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
- data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
+ metadata.gz: 9e38c82f6ce7404a78b3ecdbc9574ae860322e6945499f0c4a905956bcbd2be7
+ data.tar.gz: 4a5effb6fcf3182baad091717bc510176eb127ccd660342ce0cc46bf2d392b4a
  SHA512:
- metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
- data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
+ metadata.gz: c471bd6c6afee142945d03da1c4908355fe900a5f0c259583b7b65f97d495d07c5397d1b551da888a5970170944596959ddef73d2df803acf001b8d079d0affb
+ data.tar.gz: 99cbb2d978723f9814d8ac7163f03c642a1ac6cabbd6cf09d003f563c629563a920d909ab797729f1e233f30d5776bf9f70f4c473919e5bf101d3e3f5fd6e938
data/CHANGELOG.md CHANGED
@@ -1,7 +1,19 @@
+ ## [[0.5.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.1...v0.5.2)] - 2023-09-16
+
+ - Bump bundled llama.cpp from b1198 to b1.
+ - Add `n_ctx_train` method to Model and Context.
+ - Add nvcc option to avoid link error ([#8](https://github.com/yoshoku/llama_cpp.rb/pull/8)).
+ - Set encoding on output of `generate` module function to avoid encoding error ([#9](https://github.com/yoshoku/llama_cpp.rb/pull/9)).
+ - Add `only_copy` option to ModelQuantizeParams.
+
+ ## [[0.5.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.5.0...v0.5.1)] - 2023-09-08
+
+ - Bump bundled llama.cpp from b1140 to b1198.
+
  ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02

  **Breaking Changes**
- - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Bump bundled llama.cpp from b1060 to b1140.
  - Rename `token_to_str` method on Context to `token_to_piece` method.
  - Rename `token_to_str` method on Model to `token_to_piece` method.
  - Rename `type` method on Model to `desc` method.
@@ -10,7 +22,7 @@
  ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26

  **Breaking Changes**
- - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Bump bundled llama.cpp from master-097e121 to b1060.
  - Support new file format GGUF.
    - You should re-convert / re-quantize your model files.
  - Remove vocab methods.
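For orientation, here is a minimal usage sketch of the 0.5.2 additions. Only the method names (`n_ctx_train`, `only_copy`, the `generate` module function) come from this diff; the constructor keyword arguments, the model path, and the extra options are assumptions and should be checked against the gem's README for the installed version.

```ruby
require 'llama_cpp'

# Assumed constructor arguments; adjust to the gem's documented API.
params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# New in 0.5.2: the context size the model was trained with,
# exposed on both Model and Context.
puts model.n_ctx_train
puts context.n_ctx_train

# Since PR #9 the output of the generate module function has its encoding
# set explicitly, so it concatenates cleanly with other strings.
output = LLaMACpp.generate(context, 'Hello, World.')
puts output.encoding
```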
@@ -1,5 +1,5 @@
  UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
- Taroは親切で、親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
+ Taroは親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。

  User: こんにちには、Taro。
  Taro: こんにちは、今日はどのような要件ですか?
@@ -112,7 +112,7 @@ create_makefile('llama_cpp/llama_cpp')
  if with_config('cublas')
  File.open('Makefile', 'a') do |f|
  f.puts 'ggml-cuda.o: ggml-cuda.cu ggml-cuda.h'
- f.puts "\tnvcc -arch=native -c -o $@ $<"
+ f.puts "\tnvcc -shared -Xcompiler -fPIC -arch=native -c -o $@ $<"
  end
  end

@@ -692,6 +692,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "allow_requantize", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_allow_requantize), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_quantize_output_tensor), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "quantize_output_tensor", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_quantize_output_tensor), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_only_copy), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  }

  private:
@@ -752,6 +754,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.quantize_output_tensor ? Qtrue : Qfalse;
  }
+
+ // only_copy
+ static VALUE _llama_model_quantize_params_set_only_copy(VALUE self, VALUE only_copy) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.only_copy = RTEST(only_copy) ? true : false;
+ return ptr->params.only_copy ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_only_copy(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.only_copy ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
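The new `only_copy` accessor mirrors the existing boolean flags on `ModelQuantizeParams`. A hedged usage sketch (only the accessor itself is confirmed by this diff; the quantize call in the comment is hypothetical and should be checked against the gem's documentation):

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.only_copy = true   # new in 0.5.2: copy tensors without re-quantizing
puts params.only_copy     # => true

# Hypothetical call; verify the module-level quantize helper and its
# argument names in the gem's documentation before using.
# LLaMACpp.model_quantize(input_path: 'in.gguf', output_path: 'out.gguf', params: params)
```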
@@ -810,6 +824,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
@@ -971,6 +986,11 @@ private:
  return INT2NUM(llama_model_n_ctx(ptr->model));
  }

+ static VALUE _llama_model_get_model_n_ctx_train(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_model_n_ctx_train(ptr->model));
+ }
+
  static VALUE _llama_model_get_model_n_embd(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_model_n_embd(ptr->model));
@@ -1341,6 +1361,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
+ rb_define_method(rb_cLLaMAContext, "n_ctx_train", RUBY_METHOD_FUNC(_llama_context_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
  rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
@@ -1733,6 +1754,15 @@ private:
  return INT2NUM(llama_n_ctx(ptr->ctx));
  }

+ static VALUE _llama_context_n_ctx_train(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_n_ctx_train(ptr->ctx));
+ }
+
  static VALUE _llama_context_n_embd(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
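Like `n_ctx`, the context-level `n_ctx_train` raises a RuntimeError when the underlying context is not initialized and otherwise returns an Integer, so the two can be compared directly. A small sketch (constructor arguments and `ContextParams#n_ctx=` are assumptions taken from the gem's README style, not from this diff):

```ruby
require 'llama_cpp'

# Assumed setup; see the gem's README for the exact constructor arguments.
params = LLaMACpp::ContextParams.new
params.n_ctx = 4096
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: params)
context = LLaMACpp::Context.new(model: model)

# Warn when the configured window exceeds the model's training context length.
if context.n_ctx > context.n_ctx_train
  warn "n_ctx (#{context.n_ctx}) exceeds n_ctx_train (#{context.n_ctx_train})"
end
```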
@@ -6,6 +6,26 @@
  #include <stdlib.h>
  #include <string.h>

+ #ifdef __has_include
+ #if __has_include(<unistd.h>)
+ #include <unistd.h>
+ #if defined(_POSIX_MAPPED_FILES)
+ #include <sys/types.h>
+ #include <sys/mman.h>
+ #endif
+ #endif
+ #endif
+
+ #if defined(_WIN32)
+ #define WIN32_LEAN_AND_MEAN
+ #ifndef NOMINMAX
+ #define NOMINMAX
+ #endif
+ #include <windows.h>
+ #include <memoryapi.h>
+ #endif
+
+
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
  #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
@@ -99,19 +119,24 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
  }
  #endif

-
- static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  return ggml_nbytes(tensor);

  UNUSED(alloc);
  }

+ // check if a tensor is allocated by this buffer
+ static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
+ void * ptr = tensor->data;
+ return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ }
+
  void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  #ifdef GGML_ALLOCATOR_DEBUG
- GGML_ASSERT(ggml_is_view(tensor) == false); // views generally get data pointer from one of their sources
+ GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
  GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
  #endif
- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);

  AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -135,14 +160,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  if (best_fit_block == -1) {
  // the last block is our last resort
  struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+ max_avail = MAX(max_avail, block->size);
  if (block->size >= size) {
  best_fit_block = alloc->n_free_blocks - 1;
- max_avail = MAX(max_avail, block->size);
  } else {
  fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
  __func__, size, max_avail);
  GGML_ASSERT(!"not enough space in the buffer");
- return;
+ return;
  }
  }
  struct free_block * block = &alloc->free_blocks[best_fit_block];
@@ -177,17 +202,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
  }

  // this is a very naive implementation, but for our case the number of free blocks should be very small
- static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  void * ptr = tensor->data;

- if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+ if (ggml_allocr_is_own(alloc, tensor) == false) {
  // the tensor was not allocated in this buffer
  // this can happen because the graph allocator will try to free weights and other tensors from different buffers
  // the easiest way to deal with this is just to ignore it
  return;
  }

- size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+ size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
  size = aligned_offset(NULL, size, alloc->alignment);
  AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);

@@ -281,17 +306,68 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  return alloc;
  }

- // address and size of the buffer when measuring
- // it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
- static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
- static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
+ // OS specific functions to allocate and free uncommitted virtual memory
+ static void * alloc_vmem(size_t size) {
+ #if defined(_WIN32)
+ return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
+ #elif defined(_POSIX_MAPPED_FILES)
+ void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+ return ptr;
+ #else
+ // use a fixed address for other platforms
+ uintptr_t base_addr = (uintptr_t)-size - 0x100;
+ return (void *)base_addr;
+ #endif
+ }
+
+ static void free_vmem(void * base_addr, size_t size) {
+ #if defined(_WIN32)
+ VirtualFree(base_addr, 0, MEM_RELEASE);
+ UNUSED(size);
+ #elif defined(_POSIX_MAPPED_FILES)
+ munmap(base_addr, size);
+ #else
+ // nothing to do
+ UNUSED(base_addr);
+ UNUSED(size);
+ #endif
+ }
+
+ // allocate uncommitted virtual memory to measure the size of the graph
+ static void alloc_measure_vmem(void ** base_addr, size_t * size) {
+ // 1TB for 64-bit, 1GB for 32-bit
+ *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<40;
+ do {
+ *base_addr = alloc_vmem(*size);
+ if (*base_addr != NULL) {
+ AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
+ return;
+ }
+ // try again with half the size
+ *size /= 2;
+ } while (*size > 0);
+
+ GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
+ }
+
+ static void free_measure_vmem(void * base_addr, size_t size) {
+ free_vmem(base_addr, size);
+ }

  struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);

+ void * base_addr;
+ size_t size;
+
+ alloc_measure_vmem(&base_addr, &size);
+
  *alloc = (struct ggml_allocr){
- /*.data = */ MEASURE_BASE_ADDR,
- /*.size = */ MEASURE_MAX_SIZE,
+ /*.data = */ base_addr,
+ /*.size = */ size,
  /*.alignment = */ alignment,
  /*.n_free_blocks = */ 0,
  /*.free_blocks = */ {{0}},
@@ -311,6 +387,9 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  }

  void ggml_allocr_free(struct ggml_allocr * alloc) {
+ if (alloc->measure) {
+ free_measure_vmem(alloc->data, alloc->size);
+ }
  free(alloc);
  }

@@ -380,8 +459,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }

  // if the node's data is external, then we cannot re-use it
- if ((char *) parent->data < (char *) alloc->data ||
- (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+ if (ggml_allocr_is_own(alloc, parent) == false) {
  AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
  continue;
  }
@@ -415,7 +493,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  }
  }

- static size_t ggml_allocator_alloc_graph_tensors_n(
+ static size_t ggml_allocr_alloc_graph_tensors_n(
  struct ggml_allocr * alloc,
  struct ggml_cgraph ** graphs, int n_graphs,
  struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -493,11 +571,10 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  AT_PRINTF("\n");
  }

-
  // update parents
  // update immediately if there is no parse_seq
  // update only at barriers if there is parse_seq
- if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
  int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
  int update_end = alloc->parse_seq_len ? ind : ind + 1;
  for (int i = update_start; i < update_end; i++) {
@@ -521,12 +598,12 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  view_src_hn->n_views -= 1;
  AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
  if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ ggml_allocr_free_tensor(alloc, view_src);
  }
  }
  else {
  if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ ggml_allocr_free_tensor(alloc, parent);
  }
  }
  }
@@ -543,7 +620,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  for (int i = 0; outputs[g][i] != NULL; i++) {
  struct ggml_tensor * output = outputs[g][i];
  AT_PRINTF("output: %s\n", output->name);
- ggml_allocator_free_tensor(alloc, output);
+ ggml_allocr_free_tensor(alloc, output);
  }
  }
  }
@@ -552,5 +629,5 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  }

  size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
  }