llama_cpp 0.3.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
- data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
+ metadata.gz: 715eab98a76ed825d66da6e4fcc84154dca8eed76f6cf6625d210a1ffb702958
+ data.tar.gz: 3ceafc312354d245e485b664d71450cd9c27bcd89f5faec91af6cdf1221c251f
  SHA512:
- metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
- data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
+ metadata.gz: 7ebe959d9380c9d981156606fdd8a6bcea9b88914923e693b400cfcd605b8c216bdfdcc807c0e72a21fe5fc6d7d623118fc7246524d7f59acdb8bc0064d736bc
+ data.tar.gz: c6d428234d866c09d227b5c308a573e9721454ded3f7fdd36880706e7c47c72c67e6fed119c75d6898c6a1149cde853e5dbb59e3a390ef3d370aab4f0d6be548
data/CHANGELOG.md CHANGED
@@ -1,3 +1,22 @@
+ ## [[0.5.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.4.0...v0.5.0)] - 2023-09-02
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-b1060 to master-b1140.
+ - Rename `token_to_str` method on Context to `token_to_piece` method.
+ - Rename `token_to_str` method on Model to `token_to_piece` method.
+ - Rename `type` method on Model to `desc` method.
+ - Add `size` and `n_params` methods to Model.
+
+ ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Support new file format GGUF.
+   - You should re-convert / re-quantize your model files.
+ - Remove vocab methods.
+ - Move token_bos, token_eos, and token_nl methods to Context.
+ - Add text, score, and type methods to Context.
+
  ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19

  - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
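Taken together, the 0.4.0 and 0.5.0 entries reshape the Ruby-facing API: token id lookups move from module functions onto Context, `token_to_str` becomes `token_to_piece`, and Model gains metadata accessors. The following is a minimal sketch of that renamed surface, assuming a local GGUF model quantized as in the README section below, and assuming the `ContextParams`, `Model.new(model_path:, params:)`, and `Context.new(model:)` constructors keep their pre-0.4 keyword arguments (none of them appear in this diff).

```ruby
require 'llama_cpp'

# Hypothetical local path; the README's quantize step produces this file.
model_path = './models/open_llama_7b/ggml-model-q4_0.bin'

params  = LLaMACpp::ContextParams.new                       # assumed unchanged by this release
model   = LLaMACpp::Model.new(model_path: model_path, params: params)
context = LLaMACpp::Context.new(model: model)

# 0.5.0: Model#type is now Model#desc, and size / n_params are new accessors.
puts model.desc      # short model description string
puts model.size      # model size reported by the bindings
puts model.n_params  # total parameter count

# 0.4.0: token_bos / token_eos / token_nl are Context instance methods now,
# and 0.5.0 renames token_to_str to token_to_piece.
tokens = context.tokenize(text: 'Hello, World.', add_bos: true)
puts tokens.map { |t| context.token_to_piece(t) }.join
puts context.token_eos
puts context.token_nl
```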
data/README.md CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
  $ cd ../
  $ python3 convert.py models/open_llama_7b
  $ make
- $ ./quantize ./models/open_llama_7b/ggml-model-f16.bin ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
+ $ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
  ```

  An example of Ruby code that generates sentences with the quantization model is as follows:
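The README's generation example itself lies outside this hunk. As a hedged stand-in, a sketch along these lines should work against the model quantized above, assuming the module-level `LLaMACpp.generate` convenience wrapper and the `ContextParams#seed=` setter from earlier releases are unchanged (neither is shown in this diff).

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
params.seed = 42  # assumed setter from pre-0.4 releases

model = LLaMACpp::Model.new(
  model_path: './models/open_llama_7b/ggml-model-q4_0.bin',  # output of the quantize step above
  params: params
)
context = LLaMACpp::Context.new(model: model)

# Assumed convenience wrapper; generates a continuation for the prompt.
puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
```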
data/examples/chat.rb CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  n_keep = options[:keep]
  n_keep = embd_input.size if n_keep > embd_input.size

- token_newline = context.tokenize(text: "\n", add_bos: false)
-
  last_n_tokens = [0] * n_ctx
  interactive = true
  is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  last_n_tokens.shift
  last_n_tokens.push(id)

- if id == LLaMACpp.token_eos
- id = token_newline.first
+ if id == context.token_eos
+ id = context.token_nl
  unless antiprompt.empty?
  first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
  embd_input.concat(first_antiprompt)
@@ -124,7 +122,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation

  if input_echo
  output = []
- embd.each { |token| output << context.token_to_str(token) }
+ embd.each { |token| output << context.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -133,7 +131,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
  last_output = []
- last_n_tokens.each { |token| last_output << context.token_to_str(token) }
+ last_n_tokens.each { |token| last_output << context.token_to_piece(token) }
  last_output_str = last_output.join

  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
@@ -50,10 +50,10 @@ if with_config('accelerate')
  end

  if with_config('metal')
- $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
+ $CFLAGS << ' -DGGML_USE_METAL'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end

@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end

@@ -808,13 +808,14 @@ public:
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
- rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
- rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
+ rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+ rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
- rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
+ rb_define_method(rb_cLLaMAModel, "desc", RUBY_METHOD_FUNC(_llama_model_get_model_desc), 0);
+ rb_define_method(rb_cLLaMAModel, "size", RUBY_METHOD_FUNC(_llama_model_get_model_size), 0);
+ rb_define_method(rb_cLLaMAModel, "n_params", RUBY_METHOD_FUNC(_llama_model_get_model_n_params), 0);
  }

  private:
@@ -960,64 +961,42 @@ private:
  return Qnil;
  }

- static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_vocab(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_vocab_from_model(ptr->model));
+ return INT2NUM(llama_model_n_vocab(ptr->model));
  }

- static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_ctx(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_ctx_from_model(ptr->model));
+ return INT2NUM(llama_model_n_ctx(ptr->model));
  }

- static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_embd(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_embd_from_model(ptr->model));
+ return INT2NUM(llama_model_n_embd(ptr->model));
  }

- static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
- return Qnil;
- }
-
- const int capacity = NUM2INT(kw_values[0]);
-
- LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
- const char** vocabs = ALLOCA_N(const char*, n);
- float* scores = ALLOCA_N(float, n);
-
- llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
-
- VALUE vocabs_ary = rb_ary_new();
- VALUE scores_ary = rb_ary_new();
-
- for (int i = 0; i < n; i++) {
- rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
- rb_ary_push(scores_ary, DBL2NUM(scores[i]));
- }
-
- VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
-
- return ret;
- }
-
- static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
+ static VALUE _llama_model_token_to_piece_with_model(VALUE self, VALUE token_) {
  if (!RB_INTEGER_TYPE_P(token_)) {
  rb_raise(rb_eArgError, "token must be an integer");
  return Qnil;
  }
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const char* str = llama_token_to_str_with_model(ptr->model, token);
- return rb_str_new_cstr(str);
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_piece_with_model(ptr->model, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }

  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
@@ -1063,12 +1042,22 @@ private:
  return ret;
  }

- static VALUE _llama_model_get_model_type(VALUE self) {
+ static VALUE _llama_model_get_model_desc(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  char buf[128];
- ::llama_model_type(ptr->model, buf, sizeof(buf));
+ llama_model_desc(ptr->model, buf, sizeof(buf));
  return rb_str_new_cstr(buf);
  }
+
+ static VALUE _llama_model_get_model_size(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_size(ptr->model));
+ }
+
+ static VALUE _llama_model_get_model_n_params(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return UINT2NUM(llama_model_n_params(ptr->model));
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -1343,8 +1332,13 @@ public:
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
- rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
- rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
+ rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
+ rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
+ rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
+ rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
+ rb_define_method(rb_cLLaMAContext, "token_to_piece", RUBY_METHOD_FUNC(_llama_context_token_to_piece), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
  rb_define_method(rb_cLLaMAContext, "n_embd", RUBY_METHOD_FUNC(_llama_context_n_embd), 0);
@@ -1585,15 +1579,27 @@ private:
  return output;
  }

- static VALUE _llama_context_token_to_str(VALUE self, VALUE token_) {
+ static VALUE _llama_context_token_to_piece(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
  const llama_token token = NUM2INT(token_);
- const char* str = llama_token_to_str(ptr->ctx, token);
- return str != nullptr ? rb_utf8_str_new_cstr(str) : rb_utf8_str_new_cstr("");
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_piece(ptr->ctx, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }

  static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1655,64 @@ private:
  return output;
  }

- static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
+ static VALUE _llama_context_text(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const char* text = llama_token_get_text(ptr->ctx, token);
+ return rb_str_new_cstr(text);
+ }

+ static VALUE _llama_context_score(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const float score = llama_token_get_score(ptr->ctx, token);
+ return DBL2NUM(score);
+ }

- const int capacity = NUM2INT(kw_values[0]);
- std::vector<const char*> strings;
- std::vector<float> scores;
- int n_vocab = llama_n_vocab(ptr->ctx);
- strings.resize(n_vocab, NULL);
- scores.resize(n_vocab, 0);
+ static VALUE _llama_context_type(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ const int type = llama_token_get_type(ptr->ctx, token);
+ return INT2NUM(type);
+ }

- n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+ static VALUE _llama_context_token_bos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_bos(ptr->ctx));
+ }

- VALUE ret_strings = rb_ary_new();
- VALUE ret_scores = rb_ary_new();
- for (int i = 0; i < n_vocab; i++) {
- rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
- rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+ static VALUE _llama_context_token_eos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
  }
+ return INT2NUM(llama_token_eos(ptr->ctx));
+ }

- return rb_ary_new_from_args(2, ret_strings, ret_scores);
+ static VALUE _llama_context_token_nl(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_nl(ptr->ctx));
  }

  static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2503,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  return Qnil;
  }

- static VALUE rb_llama_token_bos(VALUE self) {
- return INT2NUM(llama_token_bos());
- }
-
- static VALUE rb_llama_token_eos(VALUE self) {
- return INT2NUM(llama_token_eos());
- }
-
- static VALUE rb_llama_token_nl(VALUE self) {
- return INT2NUM(llama_token_nl());
- }
-
  static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
  }

+ static VALUE rb_llama_time_us(VALUE self) {
+ return LONG2NUM(llama_time_us());
+ }
+
  static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
  }
@@ -2519,16 +2540,29 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
- rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+ rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);

  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -2547,6 +2581,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
@@ -2556,39 +2592,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));

  std::stringstream ss_magic;
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));

- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
-
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2605,5 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));

- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
  }