llama_cpp 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 9e0152eb9e091932225356614b57fad416c2aa96a83316f8585c9ef2872e1504
- data.tar.gz: 8ea2f00f11be7dd6524bfe69e3181fc63df7c841ed1e2d91b1b2bcafd99d0b66
+ metadata.gz: f1fcd28849baae5e90c466665aff4fe5da1d848193ebcf74c3fe333c5674191c
+ data.tar.gz: fcb0c64528d24c5cfad677f17bfd6e1e817a4b8279317ca5b2113302735598b9
  SHA512:
- metadata.gz: a85a4bdd2d1fd575eb406b9bebdf7f388db33dc42f7a2980ba9a7a6b346b539854d9df5515c9b6968727e76f035a23f59d4bc65bc5525df962dfbdf56d8b3b01
- data.tar.gz: 33641d622102257dbc1358bde0871a03c595928f5d8cedee512e1df414e4aa93433eadfcd082d4db42046320c1ed7f806dfb3aafd7934a1becb33fe275f9435c
+ metadata.gz: c70b5f919feb7a585efbe21b3360254c2f5789504cd73fecee12fd686483c77eeb763ed91a8e7434d5852208555a78f168b358d0895f15b1ea7e774d36d6910a
+ data.tar.gz: f554ad58fc9d68c39b80995b7f424468386b32a5847dbdefbceb1cba53ff7182da35be8599523d82a6daa8fee23667d07e06faedc4c727d52e8fc594d0bc7d3f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,37 @@
+ ## [[0.3.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.1...v0.3.2)] - 2023-07-08
+
+ - Bump bundled llama.cpp from master-b8c8dda to master-481f793.
+ - Add `Timings` class and `timings` method to `Context`:
+ ```ruby
+ require 'llama_cpp'
+
+ # ...
+
+ context = LLaMACpp::Context.new(model: model)
+ timings = context.timings
+
+ puts timings.class
+ # => LLaMACpp::Timings
+ puts timings.t_load_ms
+ # => 79.61
+ ```
+ - Expose sampling options as arguments of the `generate` module function:
+ ```ruby
+ require 'llama_cpp'
+
+ # ...
+
+ LLaMACpp.generate(context, 'Hello, world.', top_k: 30, top_p: 0.8, temperature: 0.9)
+ ```
+ - Add `ModelQuantizeParams` class; this class had not been published because the author forgot to call rb_define_class.
+ - Minor updates to example scripts, configuration files, and documentation.
+
+ ## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
+
+ - Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
+ - Use unsigned values for random seed.
+ - Add `eval_embd` method to `Context` class.
+
  ## [[0.3.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.2.2...v0.3.0)] - 2023-06-30

  - Add no_k_quants and qkk_64 config options:
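The 0.3.1 entry above adds `eval_embd` to `Context` but, unlike the other entries, ships no usage example. The following is a minimal sketch, inferred from the `_llama_context_eval_embd` binding later in this diff (keyword arguments `embd`, `n_past`, and optional `n_tokens` / `n_threads`; the embedding values must be Floats). The model path is a placeholder taken from the README example:

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: 'ggml-vicuna-7b-1.1-q4_0.bin', params: params)
context = LLaMACpp::Context.new(model: model)

# One embedding vector per token, flattened into a single Array of Floats.
embd = Array.new(context.n_embd, 0.0)
context.eval_embd(embd: embd, n_past: 0, n_tokens: 1, n_threads: 4)
```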
data/README.md CHANGED
@@ -68,6 +68,15 @@ User:

  ![llama_cpp_chat_example](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/374ae3d8-63a6-498f-ae6e-5552b464bdda)

+ Japanese chat is also possible using the [Vicuna model on Hugging Face](https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized).
+
+ ```sh
+ $ wget https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_0.bin
+ $ ruby chat.rb --model ggml-vicuna-7b-1.1-q4_0.bin --file prompt_jp.txt
+ ```
+
+ ![llama_cpp rb-jpchat](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/526ff18c-2bb2-4b06-8933-f72960024033)
+
  ## Contributing

  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/llama_cpp.rb.
data/examples/chat.rb CHANGED
@@ -33,7 +33,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  params = LLaMACpp::ContextParams.new
- params.seed = options[:seed]
+ params.seed = options[:seed] if options[:seed] != -1
  params.n_gpu_layers = options[:n_gpu_layers]
  model = LLaMACpp::Model.new(model_path: options[:model], params: params)
  context = LLaMACpp::Context.new(model: model)
@@ -18,7 +18,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  params = LLaMACpp::ContextParams.new
- params.seed = options[:seed]
+ params.seed = options[:seed] if options[:seed] != -1
  params.n_gpu_layers = options[:n_gpu_layers]
  params.embedding = true
  model = LLaMACpp::Model.new(model_path: options[:model], params: params)
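Both example scripts now skip assigning the seed when it is left at the default of -1, because `ContextParams#seed=` (changed later in this diff) rejects negative values. A small sketch of the resulting behavior; the error message is taken from the C++ hunk below:

```ruby
params = LLaMACpp::ContextParams.new
params.seed = 42   # accepted; stored as an unsigned value per the 0.3.1 changelog
params.seed = -1   # raises ArgumentError ("seed must be positive")
```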
@@ -0,0 +1,8 @@
+ UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
+ Taroは親切で、親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
+
+ User: こんにちには、Taro。
+ Taro: こんにちは、今日はどのような要件ですか?
+ User: 日本で最大の都市について教えてください。
+ Taro: はい、日本で最大の都市は東京です。日本の首都でもあります。
+ User:
@@ -7,8 +7,8 @@ abort 'libstdc++ is not found.' unless have_library('stdc++')

  $srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
  $srcs << 'ggml-opencl.cpp' if with_config('clblast')
- $CFLAGS << ' -w'
- $CXXFLAGS << ' -std=c++11'
+ $CFLAGS << ' -w -DNDEBUG'
+ $CXXFLAGS << ' -std=c++11 -DNDEBUG'
  $INCFLAGS << ' -I$(srcdir)/src'
  $VPATH << '$(srcdir)/src'

@@ -1,8 +1,8 @@
-
  #include "llama_cpp.h"

  VALUE rb_mLLaMACpp;
  VALUE rb_cLLaMAModel;
+ VALUE rb_cLLaMATimings;
  VALUE rb_cLLaMAContext;
  VALUE rb_cLLaMAContextParams;
  VALUE rb_cLLaMAModelQuantizeParams;
@@ -256,6 +256,111 @@ const rb_data_type_t RbLLaMATokenDataArray::llama_token_data_array_type = {
  RUBY_TYPED_FREE_IMMEDIATELY
  };

+ class LLaMATimingsWrapper {
+ public:
+ struct llama_timings timings;
+
+ LLaMATimingsWrapper(){};
+
+ ~LLaMATimingsWrapper(){};
+ };
+
+ class RbLLaMATimings {
+ public:
+ static VALUE llama_timings_alloc(VALUE self) {
+ LLaMATimingsWrapper* ptr = (LLaMATimingsWrapper*)ruby_xmalloc(sizeof(LLaMATimingsWrapper));
+ new (ptr) LLaMATimingsWrapper();
+ return TypedData_Wrap_Struct(self, &llama_timings_type, ptr);
+ }
+
+ static void llama_timings_free(void* ptr) {
+ ((LLaMATimingsWrapper*)ptr)->~LLaMATimingsWrapper();
+ ruby_xfree(ptr);
+ }
+
+ static size_t llama_timings_size(const void* ptr) {
+ return sizeof(*((LLaMATimingsWrapper*)ptr));
+ }
+
+ static LLaMATimingsWrapper* get_llama_timings(VALUE self) {
+ LLaMATimingsWrapper* ptr;
+ TypedData_Get_Struct(self, LLaMATimingsWrapper, &llama_timings_type, ptr);
+ return ptr;
+ }
+
+ static void define_class(VALUE outer) {
+ rb_cLLaMATimings = rb_define_class_under(outer, "Timings", rb_cObject);
+ rb_define_alloc_func(rb_cLLaMATimings, llama_timings_alloc);
+ rb_define_method(rb_cLLaMATimings, "t_start_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_start_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "t_end_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_end_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "t_load_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_load_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "t_sample_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_sample_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "t_p_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_p_eval_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "t_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_eval_ms), 0);
+ rb_define_method(rb_cLLaMATimings, "n_sample", RUBY_METHOD_FUNC(_llama_timings_get_n_sample), 0);
+ rb_define_method(rb_cLLaMATimings, "n_p_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_p_eval), 0);
+ rb_define_method(rb_cLLaMATimings, "n_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_eval), 0);
+ }
+
+ private:
+ static const rb_data_type_t llama_timings_type;
+
+ static VALUE _llama_timings_get_t_start_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_start_ms);
+ }
+
+ static VALUE _llama_timings_get_t_end_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_end_ms);
+ }
+
+ static VALUE _llama_timings_get_t_load_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_load_ms);
+ }
+
+ static VALUE _llama_timings_get_t_sample_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_sample_ms);
+ }
+
+ static VALUE _llama_timings_get_t_p_eval_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_p_eval_ms);
+ }
+
+ static VALUE _llama_timings_get_t_eval_ms(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return DBL2NUM(ptr->timings.t_eval_ms);
+ }
+
+ static VALUE _llama_timings_get_n_sample(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return INT2NUM(ptr->timings.n_sample);
+ }
+
+ static VALUE _llama_timings_get_n_p_eval(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return INT2NUM(ptr->timings.n_p_eval);
+ }
+
+ static VALUE _llama_timings_get_n_eval(VALUE self) {
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
+ return INT2NUM(ptr->timings.n_eval);
+ }
+ };
+
+ const rb_data_type_t RbLLaMATimings::llama_timings_type = {
+ "RbLLaMATimings",
+ { NULL,
+ RbLLaMATimings::llama_timings_free,
+ RbLLaMATimings::llama_timings_size },
+ NULL,
+ NULL,
+ RUBY_TYPED_FREE_IMMEDIATELY
+ };
+
  class LLaMAContextParamsWrapper {
  public:
  struct llama_context_params params;
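The reader methods registered in `define_class` above map one-to-one onto the fields of `llama_timings`. A short sketch that dumps them all, assuming a `context` built as in the CHANGELOG example earlier in this diff:

```ruby
timings = context.timings
%i[t_start_ms t_end_ms t_load_ms t_sample_ms t_p_eval_ms t_eval_ms
   n_sample n_p_eval n_eval].each do |name|
  # The t_*_ms readers return Floats, the n_* readers return Integers.
  puts "#{name}: #{timings.public_send(name)}"
end
```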
@@ -404,6 +509,10 @@ private:
  // seed
  static VALUE _llama_context_params_set_seed(VALUE self, VALUE seed) {
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ if (NUM2INT(seed) < 0) {
+ rb_raise(rb_eArgError, "seed must be positive");
+ return Qnil;
+ }
  ptr->params.seed = NUM2INT(seed);
  return INT2NUM(ptr->params.seed);
  };
@@ -685,6 +794,10 @@ private:
  LLaMAContextParamsWrapper* prms_ptr = RbLLaMAContextParams::get_llama_context_params(kw_values[1]);
  LLaMAModelWrapper* model_ptr = get_llama_model(self);

+ if (prms_ptr->params.seed == LLAMA_DEFAULT_SEED) {
+ prms_ptr->params.seed = time(NULL);
+ }
+
  try {
  model_ptr->model = llama_load_model_from_file(StringValueCStr(filename), prms_ptr->params);
  } catch (const std::runtime_error& e) {
@@ -848,6 +961,7 @@ public:
  rb_define_alloc_func(rb_cLLaMAContext, llama_context_alloc);
  rb_define_method(rb_cLLaMAContext, "initialize", RUBY_METHOD_FUNC(_llama_context_initialize), -1);
  rb_define_method(rb_cLLaMAContext, "eval", RUBY_METHOD_FUNC(_llama_context_eval), -1);
+ rb_define_method(rb_cLLaMAContext, "eval_embd", RUBY_METHOD_FUNC(_llama_context_eval_embd), -1);
  rb_define_method(rb_cLLaMAContext, "eval_export", RUBY_METHOD_FUNC(_llama_context_eval_export), 1);
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
@@ -857,6 +971,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
+ rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
  rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
@@ -971,6 +1086,61 @@ private:
  return Qnil;
  };

+ static VALUE _llama_context_eval_embd(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[4] = { rb_intern("embd"), rb_intern("n_past"), rb_intern("n_tokens"), rb_intern("n_threads") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 2, 2, kw_values);
+
+ if (!RB_TYPE_P(kw_values[0], T_ARRAY)) {
+ rb_raise(rb_eArgError, "tokens must be an Array");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+ rb_raise(rb_eArgError, "n_past must be an integer");
+ return Qnil;
+ }
+ if (kw_values[2] != Qundef && !RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "n_tokens must be an integer");
+ return Qnil;
+ }
+ if (kw_values[3] != Qundef && !RB_INTEGER_TYPE_P(kw_values[3])) {
+ rb_raise(rb_eArgError, "n_threads must be an integer");
+ return Qnil;
+ }
+
+ const size_t tokens_len = RARRAY_LEN(kw_values[0]);
+ std::vector<float> embd(tokens_len);
+ for (size_t i = 0; i < tokens_len; i++) {
+ VALUE el = rb_ary_entry(kw_values[0], i);
+ if (!RB_FLOAT_TYPE_P(el)) {
+ rb_raise(rb_eArgError, "embd must be an array of floats");
+ return Qnil;
+ }
+ embd[i] = NUM2DBL(el);
+ }
+
+ const int n_tokens = kw_values[2] == Qundef ? (int)tokens_len : NUM2INT(kw_values[2]);
+ const int n_past = NUM2INT(kw_values[1]);
+ const int n_threads = kw_values[3] == Qundef ? 1 : NUM2INT(kw_values[3]);
+
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ if (llama_eval_embd(ptr->ctx, embd.data(), n_tokens, n_past, n_threads) != 0) {
+ rb_raise(rb_eRuntimeError, "Failed to evaluate");
+ return Qnil;
+ }
+
+ rb_iv_set(self, "@n_tokens", INT2NUM(n_tokens));
+ rb_iv_set(self, "@has_evaluated", Qtrue);
+
+ return Qnil;
+ }
+
  static VALUE _llama_context_eval_export(VALUE self, VALUE fname_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -1163,6 +1333,18 @@ private:
  return INT2NUM(llama_n_embd(ptr->ctx));
  };

+ static VALUE _llama_context_get_timings(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ VALUE tm_obj = rb_funcall(rb_cLLaMATimings, rb_intern("new"), 0);
+ LLaMATimingsWrapper* tm_ptr = RbLLaMATimings::get_llama_timings(tm_obj);
+ tm_ptr->timings = llama_get_timings(ptr->ctx);
+ return tm_obj;
+ }
+
  static VALUE _llama_context_print_timings(VALUE self) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
@@ -1198,7 +1380,11 @@ private:
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
- const int seed = NUM2INT(seed_);
+ if (NUM2INT(seed_) < 0) {
+ rb_raise(rb_eArgError, "seed must be a non-negative integer");
+ return Qnil;
+ }
+ const uint32_t seed = NUM2INT(seed_);
  llama_set_rng_seed(ptr->ctx, seed);
  return Qnil;
  };
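The same negative-seed guard now applies when reseeding an existing context. A sketch of the Ruby-side behavior, assuming the binding exposes this C function as `Context#set_rng_seed` (the corresponding `rb_define_method` call lies outside this hunk):

```ruby
context.set_rng_seed(12_345)  # reseeds the RNG with an unsigned 32-bit value
context.set_rng_seed(-1)      # raises ArgumentError ("seed must be a non-negative integer")
```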
@@ -1830,8 +2016,10 @@ extern "C" void Init_llama_cpp(void) {
  RbLLaMATokenData::define_class(rb_mLLaMACpp);
  RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
  RbLLaMAModel::define_class(rb_mLLaMACpp);
+ RbLLaMATimings::define_class(rb_mLLaMACpp);
  RbLLaMAContext::define_class(rb_mLLaMACpp);
  RbLLaMAContextParams::define_class(rb_mLLaMACpp);
+ RbLLaMAModelQuantizeParams::define_class(rb_mLLaMACpp);

  rb_define_module_function(rb_mLLaMACpp, "init_backend", rb_llama_llama_init_backend, -1);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
@@ -1901,6 +2089,11 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));

+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
+ ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
  }
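Like the other magic-number constants defined just above it, the new `LLAMA_DEFAULT_SEED` constant is exported as a hex string rather than an Integer. A minimal sketch; the printed value depends on the bundled llama.cpp headers:

```ruby
require 'llama_cpp'

# Defined via rb_define_const with std::showbase << std::hex, so it reads back
# as a String such as "0xffffffff" rather than a Numeric.
puts LLaMACpp::LLAMA_DEFAULT_SEED
puts LLaMACpp::LLAMA_DEFAULT_SEED.class  # => String
```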