llama_cpp 0.3.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
- data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
+ metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
+ data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
  SHA512:
- metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
- data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
+ metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
+ data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+ ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Support new file format GGUF.
+ - You should re-convert / re-quantize your model files.
+ - Remove vocab methods.
+ - Move token_bos, token_eos, and token_nl methods to Context.
+ - Add text, score, and type methods to Context.
+
  ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
 
  - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
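The breaking changes above move the BOS/EOS/newline token helpers from module-level functions onto `Context` and add per-token `text`, `score`, and `type` accessors. A minimal migration sketch (the model path is a placeholder, and the `Model`/`Context` constructor keywords follow the gem's README of this era, so treat them as assumptions rather than part of this diff):

```ruby
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: './models/open_llama_7b/ggml-model-q4_0.bin', params: params)
context = LLaMACpp::Context.new(model: model)

# 0.3.x (removed): LLaMACpp.token_bos / LLaMACpp.token_eos / LLaMACpp.token_nl
# 0.4.0: the helpers live on the context
p [context.token_bos, context.token_eos, context.token_nl]

# per-token accessors added in 0.4.0
eos = context.token_eos
puts context.text(eos)   # text piece stored for the token
puts context.score(eos)  # its vocabulary score
puts context.type(eos)   # one of the LLAMA_TOKEN_TYPE_* constants
```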
data/README.md CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
  $ cd ../
  $ python3 convert.py models/open_llama_7b
  $ make
- $ ./quantize ./models/open_llama_7b/ggml-model-f16.bin ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
+ $ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
  ```
 
  An example of Ruby code that generates sentences with the quantization model is as follows:
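The hunk stops just where the README's generation example begins. As a sketch of what that call looks like against the quantized GGUF model, reusing the `context` built in the sketch after the changelog above (the prompt and the `n_threads` keyword are illustrative assumptions, not part of this diff):

```ruby
# generate a completion from the quantized model (prompt and n_threads are placeholders)
puts LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
```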
data/examples/chat.rb CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  n_keep = options[:keep]
  n_keep = embd_input.size if n_keep > embd_input.size
 
- token_newline = context.tokenize(text: "\n", add_bos: false)
-
  last_n_tokens = [0] * n_ctx
  interactive = true
  is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  last_n_tokens.shift
  last_n_tokens.push(id)
 
- if id == LLaMACpp.token_eos
- id = token_newline.first
+ if id == context.token_eos
+ id = context.token_nl
  unless antiprompt.empty?
  first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
  embd_input.concat(first_antiprompt)
@@ -53,7 +53,7 @@ if with_config('metal')
  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
@@ -808,10 +808,9 @@ public:
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
- rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+ rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
@@ -960,53 +959,19 @@ private:
  return Qnil;
  }
 
- static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_vocab(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_vocab_from_model(ptr->model));
+ return INT2NUM(llama_model_n_vocab(ptr->model));
  }
 
- static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_ctx(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_ctx_from_model(ptr->model));
+ return INT2NUM(llama_model_n_ctx(ptr->model));
  }
 
- static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
+ static VALUE _llama_model_get_model_n_embd(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
- return INT2NUM(llama_n_embd_from_model(ptr->model));
- }
-
- static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
- return Qnil;
- }
-
- const int capacity = NUM2INT(kw_values[0]);
-
- LLaMAModelWrapper* ptr = get_llama_model(self);
- const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
- const char** vocabs = ALLOCA_N(const char*, n);
- float* scores = ALLOCA_N(float, n);
-
- llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
-
- VALUE vocabs_ary = rb_ary_new();
- VALUE scores_ary = rb_ary_new();
-
- for (int i = 0; i < n; i++) {
- rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
- rb_ary_push(scores_ary, DBL2NUM(scores[i]));
- }
-
- VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
-
- return ret;
+ return INT2NUM(llama_model_n_embd(ptr->model));
  }
 
  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
@@ -1016,8 +981,20 @@ private:
  }
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
- const char* str = llama_token_to_str_with_model(ptr->model, token);
- return rb_str_new_cstr(str);
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }
 
  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
@@ -1343,7 +1320,12 @@ public:
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
- rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
+ rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
+ rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
+ rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
+ rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
+ rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -1592,8 +1574,20 @@ private:
  return Qnil;
  }
  const llama_token token = NUM2INT(token_);
- const char* str = llama_token_to_str(ptr->ctx, token);
- return str != nullptr ? rb_utf8_str_new_cstr(str) : rb_utf8_str_new_cstr("");
+ std::vector<char> result(8, 0);
+ const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ if (n_tokens < 0) {
+ result.resize(-n_tokens);
+ const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
+ if (check != -n_tokens) {
+ rb_raise(rb_eRuntimeError, "failed to convert");
+ return Qnil;
+ }
+ } else {
+ result.resize(n_tokens);
+ }
+ std::string ret(result.data(), result.size());
+ return rb_str_new_cstr(ret.c_str());
  }
 
  static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1643,64 @@ private:
  return output;
  }
 
- static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
- VALUE kw_args = Qnil;
- ID kw_table[1] = { rb_intern("capacity") };
- VALUE kw_values[1] = { Qundef };
- rb_scan_args(argc, argv, ":", &kw_args);
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
-
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
- rb_raise(rb_eArgError, "capacity must be an integer");
+ static VALUE _llama_context_text(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const char* text = llama_token_get_text(ptr->ctx, token);
+ return rb_str_new_cstr(text);
+ }
 
+ static VALUE _llama_context_score(VALUE self, VALUE token_) {
  LLaMAContextWrapper* ptr = get_llama_context(self);
  if (ptr->ctx == NULL) {
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
  return Qnil;
  }
+ const llama_token token = NUM2INT(token_);
+ const float score = llama_token_get_score(ptr->ctx, token);
+ return DBL2NUM(score);
+ }
 
- const int capacity = NUM2INT(kw_values[0]);
- std::vector<const char*> strings;
- std::vector<float> scores;
- int n_vocab = llama_n_vocab(ptr->ctx);
- strings.resize(n_vocab, NULL);
- scores.resize(n_vocab, 0);
+ static VALUE _llama_context_type(VALUE self, VALUE token_) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ const int type = llama_token_get_type(ptr->ctx, token);
+ return INT2NUM(type);
+ }
 
- n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
+ static VALUE _llama_context_token_bos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_bos(ptr->ctx));
+ }
 
- VALUE ret_strings = rb_ary_new();
- VALUE ret_scores = rb_ary_new();
- for (int i = 0; i < n_vocab; i++) {
- rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
- rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
+ static VALUE _llama_context_token_eos(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
  }
+ return INT2NUM(llama_token_eos(ptr->ctx));
+ }
 
- return rb_ary_new_from_args(2, ret_strings, ret_scores);
+ static VALUE _llama_context_token_nl(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(llama_token_nl(ptr->ctx));
  }
 
  static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2491,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
  return Qnil;
  }
 
- static VALUE rb_llama_token_bos(VALUE self) {
- return INT2NUM(llama_token_bos());
- }
-
- static VALUE rb_llama_token_eos(VALUE self) {
- return INT2NUM(llama_token_eos());
- }
-
- static VALUE rb_llama_token_nl(VALUE self) {
- return INT2NUM(llama_token_nl());
- }
-
  static VALUE rb_llama_print_system_info(VALUE self) {
  const char* result = llama_print_system_info();
  return rb_utf8_str_new_cstr(result);
  }
 
+ static VALUE rb_llama_time_us(VALUE self) {
+ return LONG2NUM(llama_time_us());
+ }
+
  static VALUE rb_llama_mmap_supported(VALUE self) {
  return llama_mmap_supported() ? Qtrue : Qfalse;
  }
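The hunk above removes the module-level `token_bos`/`token_eos`/`token_nl` wrappers (they now live on `Context`) and adds `rb_llama_time_us`, exposed to Ruby as `LLaMACpp.time_us`. A small timing sketch using it, reusing the `context` from the earlier sketch (`LLaMACpp.generate` and its keywords are assumptions, not part of this diff):

```ruby
t0 = LLaMACpp.time_us
output = LLaMACpp.generate(context, 'Hello, World.', n_threads: 4)
t1 = LLaMACpp.time_us

puts output
puts format('generated in %.1f ms', (t1 - t0) / 1000.0)
```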
@@ -2519,16 +2528,29 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
- rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
- rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
+ rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
+
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
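This hunk registers the new log-level, vocabulary-type, and token-type constants introduced with the GGUF-era llama.cpp API. A hedged sketch of pairing them with the new `Context#type` and `Context#text` accessors (the lookup table, prompt, and `context` variable are illustrative; the `tokenize` keywords match the chat.rb example earlier in this diff):

```ruby
TOKEN_TYPE_NAMES = {
  LLaMACpp::LLAMA_TOKEN_TYPE_UNDEFINED    => 'undefined',
  LLaMACpp::LLAMA_TOKEN_TYPE_NORMAL       => 'normal',
  LLaMACpp::LLAMA_TOKEN_TYPE_UNKNOWN      => 'unknown',
  LLaMACpp::LLAMA_TOKEN_TYPE_CONTROL      => 'control',
  LLaMACpp::LLAMA_TOKEN_TYPE_USER_DEFINED => 'user-defined',
  LLaMACpp::LLAMA_TOKEN_TYPE_UNUSED       => 'unused',
  LLaMACpp::LLAMA_TOKEN_TYPE_BYTE         => 'byte'
}.freeze

# print each token's id, type name, and text piece for a short prompt
context.tokenize(text: 'Hello, World.', add_bos: true).each do |t|
  puts format('%6d  %-12s  %s', t, TOKEN_TYPE_NAMES[context.type(t)], context.text(t))
end
```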
@@ -2547,6 +2569,8 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
 
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
@@ -2556,39 +2580,9 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
 
  std::stringstream ss_magic;
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
-
- ss_magic.str("");
- ss_magic.clear(std::stringstream::goodbit);
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
-
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2593,5 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
  }
@@ -8,6 +8,7 @@
 
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
  //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
  struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
  size_t max_size;
  bool measure;
- int parse_seq[GGML_MAX_NODES];
- bool has_parse_seq;
+ int parse_seq[GGML_MAX_CONCUR];
+ int parse_seq_len;
 
  #ifdef GGML_ALLOCATOR_DEBUG
  struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
  };
 
  #ifdef GGML_ALLOCATOR_DEBUG
- static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == NULL) {
  alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
  }
  GGML_ASSERT(!"out of allocated_tensors");
  }
- static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i] == tensor ||
  (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
  alloc->n_free_blocks++;
  }
 
- void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
- int pos = 0;
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
  for (int i = 0; i < n; i++) {
- if (list[i] != -1) {
- alloc->parse_seq[pos] = list[i];
- pos++;
- }
+ alloc->parse_seq[i] = list[i];
  }
- alloc->has_parse_seq = true;
+ alloc->parse_seq_len = n;
  }
 
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
  /*.max_size = */ 0,
  /*.measure = */ false,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ = {0},
  #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
  /*.max_size = */ 0,
  /*.measure = */ true,
  /*.parse_seq = */ {0},
- /*.has_parse_seq = */ false,
+ /*.parse_seq_len = */ 0,
  #ifdef GGML_ALLOCATOR_DEBUG
  /*.allocated_tensors = */ = {0},
  #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
  else {
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
  node->data = parent->data;
+ return;
  }
- return;
  }
  }
  }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
  allocate_node(alloc, input);
  }
  }
- for (int ind = 0; ind < gf->n_nodes; ind++) {
- int i;
- if (alloc->has_parse_seq) {
- i = alloc->parse_seq[ind];
- } else {
- i = ind;
- }
- struct ggml_tensor * node = gf->nodes[i];
-
- // allocate parents (leafs)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
+ int last_barrier_pos = 0;
+ int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
+
+ for (int ind = 0; ind < n_nodes; ind++) {
+ // allocate a node if there is no parse_seq or this is not a barrier
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
+ int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
+ struct ggml_tensor * node = gf->nodes[i];
+
+ // allocate parents (leafs)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ allocate_node(alloc, parent);
  }
- allocate_node(alloc, parent);
- }
 
- // allocate node
- allocate_node(alloc, node);
+ // allocate node
+ allocate_node(alloc, node);
 
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
+ }
  }
+ AT_PRINTF("\n");
  }
- AT_PRINTF("\n");
+
 
  // update parents
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(ht, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = get_view_source(parent);
- struct hash_node * view_src_hn = hash_get(ht, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
- ggml_allocator_free_tensor(alloc, view_src);
+ // update immediately if there is no parse_seq
+ // update only at barriers if there is parse_seq
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
+ int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
+ int update_end = alloc->parse_seq_len ? ind : ind + 1;
+ for (int i = update_start; i < update_end; i++) {
+ int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
+ struct ggml_tensor * node = gf->nodes[node_i];
+
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
  }
- }
- else {
- if (parent->data != node->data) {
- ggml_allocator_free_tensor(alloc, parent);
+ struct hash_node * p_hn = hash_get(ht, parent);
+ p_hn->n_children -= 1;
+
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = get_view_source(parent);
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s\n", view_src->name);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+ ggml_allocator_free_tensor(alloc, view_src);
+ }
+ }
+ else {
+ if (parent->data != node->data) {
+ ggml_allocator_free_tensor(alloc, parent);
+ }
+ }
  }
  }
  }
+ AT_PRINTF("\n");
+ if (alloc->parse_seq_len) {
+ last_barrier_pos = ind + 1;
+ }
  }
- AT_PRINTF("\n");
  }
  // free graph outputs here that wouldn't be freed otherwise because they have no children
  if (outputs != NULL && outputs[g] != NULL) {
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph are optimized to execute out-of-order
- GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
  GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
  GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);