llama_cpp 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a1f299e21bfe5b12d517a4254657cbc5bf9af6d0571285e2a5aff67b9175646
4
- data.tar.gz: 62dd6e0d4f0b052a912d87b52cd0cff5bb873ab12378413a3ee0af5671331ef6
3
+ metadata.gz: f1fcd28849baae5e90c466665aff4fe5da1d848193ebcf74c3fe333c5674191c
4
+ data.tar.gz: fcb0c64528d24c5cfad677f17bfd6e1e817a4b8279317ca5b2113302735598b9
5
5
  SHA512:
6
- metadata.gz: b12dc73914e5c7ecdd951fd57b70e01aae1926a2adc88030b5f5310f99c789e129cf552811363ec99525b37b9ca167a708cb756057b94f5cf4dd2a0100b06b6e
7
- data.tar.gz: d1d79696b08f89894de02a02fac91f0783c432efa641b21ee59f6987946b045681a60113392db6c85fe97bd0e1fc9860235faa358fb805bb0de21eb85926edd5
6
+ metadata.gz: c70b5f919feb7a585efbe21b3360254c2f5789504cd73fecee12fd686483c77eeb763ed91a8e7434d5852208555a78f168b358d0895f15b1ea7e774d36d6910a
7
+ data.tar.gz: f554ad58fc9d68c39b80995b7f424468386b32a5847dbdefbceb1cba53ff7182da35be8599523d82a6daa8fee23667d07e06faedc4c727d52e8fc594d0bc7d3f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,31 @@
1
+ ## [[0.3.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.1...v0.3.2)] - 2023-07-08
2
+
3
+ - Bump bundled llama.cpp from master-b8c8dda to master-481f793.
4
+ - Add `Timings` class and `timings` method to `Context`:
5
+ ```ruby
6
+ require 'llama_cpp'
7
+
8
+ # ...
9
+
10
+ context = LLaMACpp::Context.new(model: model)
11
+ timings = context.timings
12
+
13
+ puts timings.class
14
+ # => LLaMACpp::Timings
15
+ puts timings.t_load_ms
16
+ # => 79.61
17
+ ```
18
+ - Expose sampling options as the arguments of `generate` module function:
19
+ ```ruby
20
+ require 'llama_cpp'
21
+
22
+ # ...
23
+
24
+ LLaMACpp.generate(context, 'Hello, world.', top_k: 30, top_p: 0.8, temperature: 0.9)
25
+ ```
26
+ - Add `ModelQuantizeParams` class, this class was not published because the author forgot to write rb_define_class.
27
+ - Minor update to example scripts, configuration files, and documentation.
28
+
1
29
  ## [[0.3.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.0...v0.3.1)] - 2023-07-02
2
30
 
3
31
  - Bump bundled llama.cpp from master-9d23589 to master-b8c8dda.
data/README.md CHANGED
@@ -68,6 +68,15 @@ User:
68
68
 
69
69
  ![llama_cpp_chat_example](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/374ae3d8-63a6-498f-ae6e-5552b464bdda)
70
70
 
71
+ Japanese chat is also possible using the [Vicuna model on Hugging Face](https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized).
72
+
73
+ ```sh
74
+ $ wget https://huggingface.co/CRD716/ggml-vicuna-1.1-quantized/resolve/main/ggml-vicuna-7b-1.1-q4_0.bin
75
+ $ ruby chat.rb --model ggml-vicuna-7b-1.1-q4_0.bin --file prompt_jp.txt
76
+ ```
77
+
78
+ ![llama_cpp rb-jpchat](https://github.com/yoshoku/llama_cpp.rb/assets/5562409/526ff18c-2bb2-4b06-8933-f72960024033)
79
+
71
80
  ## Contributing
72
81
 
73
82
  Bug reports and pull requests are welcome on GitHub at https://github.com/yoshoku/llama_cpp.rb.
data/examples/chat.rb CHANGED
@@ -33,7 +33,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
33
33
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
34
34
  def main # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
35
35
  params = LLaMACpp::ContextParams.new
36
- params.seed = options[:seed]
36
+ params.seed = options[:seed] if options[:seed] != -1
37
37
  params.n_gpu_layers = options[:n_gpu_layers]
38
38
  model = LLaMACpp::Model.new(model_path: options[:model], params: params)
39
39
  context = LLaMACpp::Context.new(model: model)
@@ -18,7 +18,7 @@ class Embedding < Thor # rubocop:disable Style/Documentation
18
18
  option :n_gpu_layers, type: :numeric, desc: 'number of layers on GPU', default: 0
19
19
  def main # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
20
20
  params = LLaMACpp::ContextParams.new
21
- params.seed = options[:seed]
21
+ params.seed = options[:seed] if options[:seed] != -1
22
22
  params.n_gpu_layers = options[:n_gpu_layers]
23
23
  params.embedding = true
24
24
  model = LLaMACpp::Model.new(model_path: options[:model], params: params)
@@ -0,0 +1,8 @@
1
+ UserがTaroという名前のアシスタントと対話するダイアログのトランスクリプト。
2
+ Taroは親切で、親切で、正直で、文章を書くのが上手で、ユーザーのリクエストに即座に正確に答えることを怠りません。
3
+
4
+ User: こんにちには、Taro。
5
+ Taro: こんにちは、今日はどのような要件ですか?
6
+ User: 日本で最大の都市について教えてください。
7
+ Taro: はい、日本で最大の都市は東京です。日本の首都でもあります。
8
+ User:
@@ -7,8 +7,8 @@ abort 'libstdc++ is not found.' unless have_library('stdc++')
7
7
 
8
8
  $srcs = %w[ggml.c llama.cpp llama_cpp.cpp]
9
9
  $srcs << 'ggml-opencl.cpp' if with_config('clblast')
10
- $CFLAGS << ' -w'
11
- $CXXFLAGS << ' -std=c++11'
10
+ $CFLAGS << ' -w -DNDEBUG'
11
+ $CXXFLAGS << ' -std=c++11 -DNDEBUG'
12
12
  $INCFLAGS << ' -I$(srcdir)/src'
13
13
  $VPATH << '$(srcdir)/src'
14
14
 
@@ -1,8 +1,8 @@
1
-
2
1
  #include "llama_cpp.h"
3
2
 
4
3
  VALUE rb_mLLaMACpp;
5
4
  VALUE rb_cLLaMAModel;
5
+ VALUE rb_cLLaMATimings;
6
6
  VALUE rb_cLLaMAContext;
7
7
  VALUE rb_cLLaMAContextParams;
8
8
  VALUE rb_cLLaMAModelQuantizeParams;
@@ -256,6 +256,111 @@ const rb_data_type_t RbLLaMATokenDataArray::llama_token_data_array_type = {
256
256
  RUBY_TYPED_FREE_IMMEDIATELY
257
257
  };
258
258
 
259
+ class LLaMATimingsWrapper {
260
+ public:
261
+ struct llama_timings timings;
262
+
263
+ LLaMATimingsWrapper(){};
264
+
265
+ ~LLaMATimingsWrapper(){};
266
+ };
267
+
268
+ class RbLLaMATimings {
269
+ public:
270
+ static VALUE llama_timings_alloc(VALUE self) {
271
+ LLaMATimingsWrapper* ptr = (LLaMATimingsWrapper*)ruby_xmalloc(sizeof(LLaMATimingsWrapper));
272
+ new (ptr) LLaMATimingsWrapper();
273
+ return TypedData_Wrap_Struct(self, &llama_timings_type, ptr);
274
+ }
275
+
276
+ static void llama_timings_free(void* ptr) {
277
+ ((LLaMATimingsWrapper*)ptr)->~LLaMATimingsWrapper();
278
+ ruby_xfree(ptr);
279
+ }
280
+
281
+ static size_t llama_timings_size(const void* ptr) {
282
+ return sizeof(*((LLaMATimingsWrapper*)ptr));
283
+ }
284
+
285
+ static LLaMATimingsWrapper* get_llama_timings(VALUE self) {
286
+ LLaMATimingsWrapper* ptr;
287
+ TypedData_Get_Struct(self, LLaMATimingsWrapper, &llama_timings_type, ptr);
288
+ return ptr;
289
+ }
290
+
291
+ static void define_class(VALUE outer) {
292
+ rb_cLLaMATimings = rb_define_class_under(outer, "Timings", rb_cObject);
293
+ rb_define_alloc_func(rb_cLLaMATimings, llama_timings_alloc);
294
+ rb_define_method(rb_cLLaMATimings, "t_start_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_start_ms), 0);
295
+ rb_define_method(rb_cLLaMATimings, "t_end_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_end_ms), 0);
296
+ rb_define_method(rb_cLLaMATimings, "t_load_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_load_ms), 0);
297
+ rb_define_method(rb_cLLaMATimings, "t_sample_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_sample_ms), 0);
298
+ rb_define_method(rb_cLLaMATimings, "t_p_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_p_eval_ms), 0);
299
+ rb_define_method(rb_cLLaMATimings, "t_eval_ms", RUBY_METHOD_FUNC(_llama_timings_get_t_eval_ms), 0);
300
+ rb_define_method(rb_cLLaMATimings, "n_sample", RUBY_METHOD_FUNC(_llama_timings_get_n_sample), 0);
301
+ rb_define_method(rb_cLLaMATimings, "n_p_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_p_eval), 0);
302
+ rb_define_method(rb_cLLaMATimings, "n_eval", RUBY_METHOD_FUNC(_llama_timings_get_n_eval), 0);
303
+ }
304
+
305
+ private:
306
+ static const rb_data_type_t llama_timings_type;
307
+
308
+ static VALUE _llama_timings_get_t_start_ms(VALUE self) {
309
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
310
+ return DBL2NUM(ptr->timings.t_start_ms);
311
+ }
312
+
313
+ static VALUE _llama_timings_get_t_end_ms(VALUE self) {
314
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
315
+ return DBL2NUM(ptr->timings.t_end_ms);
316
+ }
317
+
318
+ static VALUE _llama_timings_get_t_load_ms(VALUE self) {
319
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
320
+ return DBL2NUM(ptr->timings.t_load_ms);
321
+ }
322
+
323
+ static VALUE _llama_timings_get_t_sample_ms(VALUE self) {
324
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
325
+ return DBL2NUM(ptr->timings.t_sample_ms);
326
+ }
327
+
328
+ static VALUE _llama_timings_get_t_p_eval_ms(VALUE self) {
329
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
330
+ return DBL2NUM(ptr->timings.t_p_eval_ms);
331
+ }
332
+
333
+ static VALUE _llama_timings_get_t_eval_ms(VALUE self) {
334
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
335
+ return DBL2NUM(ptr->timings.t_eval_ms);
336
+ }
337
+
338
+ static VALUE _llama_timings_get_n_sample(VALUE self) {
339
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
340
+ return INT2NUM(ptr->timings.n_sample);
341
+ }
342
+
343
+ static VALUE _llama_timings_get_n_p_eval(VALUE self) {
344
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
345
+ return INT2NUM(ptr->timings.n_p_eval);
346
+ }
347
+
348
+ static VALUE _llama_timings_get_n_eval(VALUE self) {
349
+ LLaMATimingsWrapper* ptr = get_llama_timings(self);
350
+ return INT2NUM(ptr->timings.n_eval);
351
+ }
352
+ };
353
+
354
+ const rb_data_type_t RbLLaMATimings::llama_timings_type = {
355
+ "RbLLaMATimings",
356
+ { NULL,
357
+ RbLLaMATimings::llama_timings_free,
358
+ RbLLaMATimings::llama_timings_size },
359
+ NULL,
360
+ NULL,
361
+ RUBY_TYPED_FREE_IMMEDIATELY
362
+ };
363
+
259
364
  class LLaMAContextParamsWrapper {
260
365
  public:
261
366
  struct llama_context_params params;
@@ -866,6 +971,7 @@ public:
866
971
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
867
972
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
868
973
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
974
+ rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
869
975
  rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
870
976
  rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
871
977
  rb_define_method(rb_cLLaMAContext, "kv_cache_token_count", RUBY_METHOD_FUNC(_llama_context_kv_cache_token_count), 0);
@@ -1227,6 +1333,18 @@ private:
1227
1333
  return INT2NUM(llama_n_embd(ptr->ctx));
1228
1334
  };
1229
1335
 
1336
+ static VALUE _llama_context_get_timings(VALUE self) {
1337
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1338
+ if (ptr->ctx == NULL) {
1339
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1340
+ return Qnil;
1341
+ }
1342
+ VALUE tm_obj = rb_funcall(rb_cLLaMATimings, rb_intern("new"), 0);
1343
+ LLaMATimingsWrapper* tm_ptr = RbLLaMATimings::get_llama_timings(tm_obj);
1344
+ tm_ptr->timings = llama_get_timings(ptr->ctx);
1345
+ return tm_obj;
1346
+ }
1347
+
1230
1348
  static VALUE _llama_context_print_timings(VALUE self) {
1231
1349
  LLaMAContextWrapper* ptr = get_llama_context(self);
1232
1350
  if (ptr->ctx == NULL) {
@@ -1898,8 +2016,10 @@ extern "C" void Init_llama_cpp(void) {
1898
2016
  RbLLaMATokenData::define_class(rb_mLLaMACpp);
1899
2017
  RbLLaMATokenDataArray::define_class(rb_mLLaMACpp);
1900
2018
  RbLLaMAModel::define_class(rb_mLLaMACpp);
2019
+ RbLLaMATimings::define_class(rb_mLLaMACpp);
1901
2020
  RbLLaMAContext::define_class(rb_mLLaMACpp);
1902
2021
  RbLLaMAContextParams::define_class(rb_mLLaMACpp);
2022
+ RbLLaMAModelQuantizeParams::define_class(rb_mLLaMACpp);
1903
2023
 
1904
2024
  rb_define_module_function(rb_mLLaMACpp, "init_backend", rb_llama_llama_init_backend, -1);
1905
2025
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);