llama_cpp 0.3.8 → 0.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 8a6623a24970936369944231171226dda1ce579bf29fc3711f8923c8d2d22cba
- data.tar.gz: dbff8f38ea54195b05fc0acbaf8fceb7fd6bfdc329100a18665ef2cba2fd5d81
+ metadata.gz: af3a0e01bc9f3cfad4cee3f21144dd354640e1d4558125be36d4b499fa3b4c24
+ data.tar.gz: 042a3b0491d98fa6a093c684e6ab751152f37c8438a3b4a7b19cb2d8c7ab95a7
  SHA512:
- metadata.gz: 710ab86cfea7b5f91a386bdf87872c1d19ba49057bc02aa11a4f0198aee404a2d5b931965fdeba40aa1353269f95a451090e261305931e31a182a078827ace80
- data.tar.gz: ec4d956b5ab5ad665a0e99489b81b364b79ed39e74146629e4140240b5e176f4ef9dbf3d1c11acdb4098398114fbf055a2ad4f8251ed98ec42471a478f6dcaa2
+ metadata.gz: 7ed85bd8438ee3b3adab884795c4aecb5b0d72ad57b7e02bc281b62c3b1d669efab62a020e03b09defe3084ecd8afacc4220303e99167d04d668650768c7392b
+ data.tar.gz: b705a0ccd2c7c1e15aed6383acb9d5a3d79d0a0c882a74c42b9099df9a27aff88ba08a2f06aa4d195382e8f41c1b16c0014a2047d1923369f275ca481d52bb21
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
+ ## [[0.4.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.8...v0.4.0)] - 2023-08-26
+
+ **Breaking Changes**
+ - Bump bundled llama.cpp from master-097e121 to master-b1060.
+ - Support new file format GGUF.
+ - You should re-convert / re-quantize your model files.
+ - Remove vocab methods.
+ - Move token_bos, token_eos, and token_nl methods to Context.
+ - Add text, score, and type methods to Context.
+
  ## [[0.3.8](https://github.com/yoshoku/llama_cpp.rb/compare/v0.3.7...v0.3.8)] - 2023-08-19
 
  - Bump bundled llama.cpp from master-9ca4abe to master-097e121.
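As a quick orientation for the breaking changes above, here is a hedged migration sketch in Ruby. The `context` setup is assumed and not shown; the method names themselves are taken from the changelog and from the bindings further down in this diff.

```ruby
# Hypothetical 0.3.x -> 0.4.0 migration sketch; `context` is assumed to be an
# already-initialized LLaMACpp::Context.
require 'llama_cpp'

# 0.3.x: module-level token helpers, newline obtained by tokenizing "\n"
eos = LLaMACpp.token_eos
nl  = context.tokenize(text: "\n", add_bos: false).first

# 0.4.0: the helpers live on Context
eos = context.token_eos
nl  = context.token_nl

# 0.4.0: new per-token accessors on Context
context.text(eos)   # piece of text for the token
context.score(eos)  # vocabulary score
context.type(eos)   # one of the LLAMA_TOKEN_TYPE_* constants
```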
data/README.md CHANGED
@@ -51,7 +51,7 @@ $ git clone https://huggingface.co/openlm-research/open_llama_7b
  $ cd ../
  $ python3 convert.py models/open_llama_7b
  $ make
- $ ./quantize ./models/open_llama_7b/ggml-model-f16.bin ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
+ $ ./quantize ./models/open_llama_7b/ggml-model-f16.gguf ./models/open_llama_7b/ggml-model-q4_0.bin q4_0
  ```
 
  An example of Ruby code that generates sentences with the quantization model is as follows:
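The README hunk above is truncated just before the generation example it announces. As a rough sketch only: the class names, keyword arguments, and prompt below are assumptions about this gem's API, not the README's actual code.

```ruby
# Hedged sketch of sentence generation with the quantized model; the exact
# README example may differ.
require 'llama_cpp'

params = LLaMACpp::ContextParams.new
model = LLaMACpp::Model.new(model_path: './models/open_llama_7b/ggml-model-q4_0.bin', params: params)
context = LLaMACpp::Context.new(model: model)

puts LLaMACpp.generate(context, 'Hello, World.')
```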
data/examples/chat.rb CHANGED
@@ -49,8 +49,6 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  n_keep = options[:keep]
  n_keep = embd_input.size if n_keep > embd_input.size
 
- token_newline = context.tokenize(text: "\n", add_bos: false)
-
  last_n_tokens = [0] * n_ctx
  interactive = true
  is_interacting = false
@@ -101,8 +99,8 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  last_n_tokens.shift
  last_n_tokens.push(id)
 
- if id == LLaMACpp.token_eos
- id = token_newline.first
+ if id == context.token_eos
+ id = context.token_nl
  unless antiprompt.empty?
  first_antiprompt = context.tokenize(text: antiprompt, add_bos: false)
  embd_input.concat(first_antiprompt)
data/ext/llama_cpp/extconf.rb CHANGED
@@ -53,7 +53,7 @@ if with_config('metal')
  $CFLAGS << ' -DGGML_USE_METAL -DGGML_METAL_NDEBUG'
  $CXXFLAGS << ' -DGGML_USE_METAL'
  $LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
- $objs = %w[ggml.o ggml-alloc.o llama.o llama_cpp.o ggml-metal.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-metal.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
@@ -61,7 +61,7 @@ if with_config('cublas')
  $CFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $CXXFLAGS << ' -DGGML_USE_CUBLAS -I/usr/local/cuda/include'
  $LDFLAGS << ' -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64'
- $objs = %w[ggml-cuda.o ggml.o llama.o llama_cpp.o]
+ $objs = %w[ggml.o ggml-alloc.o ggml-cuda.o llama.o llama_cpp.o]
  $objs << 'k_quants.o' unless with_config('no_k_quants')
  end
 
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -808,10 +808,9 @@ public:
  rb_define_method(rb_cLLaMAModel, "free", RUBY_METHOD_FUNC(_llama_model_free), 0);
  rb_define_method(rb_cLLaMAModel, "load", RUBY_METHOD_FUNC(_llama_model_load), -1);
  rb_define_method(rb_cLLaMAModel, "apply_lora_from_file", RUBY_METHOD_FUNC(_llama_model_apply_lora_from_file), -1);
- rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_n_vocab_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_n_ctx_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_n_embd_from_model), 0);
- rb_define_method(rb_cLLaMAModel, "vocab", RUBY_METHOD_FUNC(_llama_model_get_vocab_from_model), -1);
+ rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
+ rb_define_method(rb_cLLaMAModel, "n_ctx", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx), 0);
+ rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_str", RUBY_METHOD_FUNC(_llama_model_token_to_str_with_model), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize_with_model), -1);
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_model_type), 0);
@@ -960,53 +959,19 @@ private:
960
959
  return Qnil;
961
960
  }
962
961
 
963
- static VALUE _llama_model_get_n_vocab_from_model(VALUE self) {
962
+ static VALUE _llama_model_get_model_n_vocab(VALUE self) {
964
963
  LLaMAModelWrapper* ptr = get_llama_model(self);
965
- return INT2NUM(llama_n_vocab_from_model(ptr->model));
964
+ return INT2NUM(llama_model_n_vocab(ptr->model));
966
965
  }
967
966
 
968
- static VALUE _llama_model_get_n_ctx_from_model(VALUE self) {
967
+ static VALUE _llama_model_get_model_n_ctx(VALUE self) {
969
968
  LLaMAModelWrapper* ptr = get_llama_model(self);
970
- return INT2NUM(llama_n_ctx_from_model(ptr->model));
969
+ return INT2NUM(llama_model_n_ctx(ptr->model));
971
970
  }
972
971
 
973
- static VALUE _llama_model_get_n_embd_from_model(VALUE self) {
972
+ static VALUE _llama_model_get_model_n_embd(VALUE self) {
974
973
  LLaMAModelWrapper* ptr = get_llama_model(self);
975
- return INT2NUM(llama_n_embd_from_model(ptr->model));
976
- }
977
-
978
- static VALUE _llama_model_get_vocab_from_model(int argc, VALUE* argv, VALUE self) {
979
- VALUE kw_args = Qnil;
980
- ID kw_table[1] = { rb_intern("capacity") };
981
- VALUE kw_values[1] = { Qundef };
982
- rb_scan_args(argc, argv, ":", &kw_args);
983
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
984
-
985
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
986
- rb_raise(rb_eArgError, "capacity must be an integer");
987
- return Qnil;
988
- }
989
-
990
- const int capacity = NUM2INT(kw_values[0]);
991
-
992
- LLaMAModelWrapper* ptr = get_llama_model(self);
993
- const int n = std::min(capacity, llama_n_vocab_from_model(ptr->model));
994
- const char** vocabs = ALLOCA_N(const char*, n);
995
- float* scores = ALLOCA_N(float, n);
996
-
997
- llama_get_vocab_from_model(ptr->model, vocabs, scores, capacity);
998
-
999
- VALUE vocabs_ary = rb_ary_new();
1000
- VALUE scores_ary = rb_ary_new();
1001
-
1002
- for (int i = 0; i < n; i++) {
1003
- rb_ary_push(vocabs_ary, rb_str_new_cstr(vocabs[i]));
1004
- rb_ary_push(scores_ary, DBL2NUM(scores[i]));
1005
- }
1006
-
1007
- VALUE ret = rb_ary_new3(2, vocabs_ary, scores_ary);
1008
-
1009
- return ret;
974
+ return INT2NUM(llama_model_n_embd(ptr->model));
1010
975
  }
1011
976
 
1012
977
  static VALUE _llama_model_token_to_str_with_model(VALUE self, VALUE token_) {
@@ -1016,8 +981,20 @@ private:
1016
981
  }
1017
982
  const llama_token token = NUM2INT(token_);
1018
983
  LLaMAModelWrapper* ptr = get_llama_model(self);
1019
- const char* str = llama_token_to_str_with_model(ptr->model, token);
1020
- return rb_str_new_cstr(str);
984
+ std::vector<char> result(8, 0);
985
+ const int n_tokens = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
986
+ if (n_tokens < 0) {
987
+ result.resize(-n_tokens);
988
+ const int check = llama_token_to_str_with_model(ptr->model, token, result.data(), result.size());
989
+ if (check != -n_tokens) {
990
+ rb_raise(rb_eRuntimeError, "failed to convert");
991
+ return Qnil;
992
+ }
993
+ } else {
994
+ result.resize(n_tokens);
995
+ }
996
+ std::string ret(result.data(), result.size());
997
+ return rb_str_new_cstr(ret.c_str());
1021
998
  }
1022
999
 
1023
1000
  static VALUE _llama_model_tokenize_with_model(int argc, VALUE* argv, VALUE self) {
@@ -1343,7 +1320,12 @@ public:
1343
1320
  rb_define_method(rb_cLLaMAContext, "tokenize", RUBY_METHOD_FUNC(_llama_context_tokenize), -1);
1344
1321
  rb_define_method(rb_cLLaMAContext, "logits", RUBY_METHOD_FUNC(_llama_context_logits), 0);
1345
1322
  rb_define_method(rb_cLLaMAContext, "embeddings", RUBY_METHOD_FUNC(_llama_context_embeddings), 0);
1346
- rb_define_method(rb_cLLaMAContext, "vocab", RUBY_METHOD_FUNC(_llama_context_vocab), -1);
1323
+ rb_define_method(rb_cLLaMAContext, "text", RUBY_METHOD_FUNC(_llama_context_text), 1);
1324
+ rb_define_method(rb_cLLaMAContext, "score", RUBY_METHOD_FUNC(_llama_context_score), 1);
1325
+ rb_define_method(rb_cLLaMAContext, "type", RUBY_METHOD_FUNC(_llama_context_type), 1);
1326
+ rb_define_method(rb_cLLaMAContext, "token_bos", RUBY_METHOD_FUNC(_llama_context_token_bos), 0);
1327
+ rb_define_method(rb_cLLaMAContext, "token_eos", RUBY_METHOD_FUNC(_llama_context_token_eos), 0);
1328
+ rb_define_method(rb_cLLaMAContext, "token_nl", RUBY_METHOD_FUNC(_llama_context_token_nl), 0);
1347
1329
  rb_define_method(rb_cLLaMAContext, "token_to_str", RUBY_METHOD_FUNC(_llama_context_token_to_str), 1);
1348
1330
  rb_define_method(rb_cLLaMAContext, "n_vocab", RUBY_METHOD_FUNC(_llama_context_n_vocab), 0);
1349
1331
  rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
@@ -1592,8 +1574,20 @@ private:
1592
1574
  return Qnil;
1593
1575
  }
1594
1576
  const llama_token token = NUM2INT(token_);
1595
- const char* str = llama_token_to_str(ptr->ctx, token);
1596
- return str != nullptr ? rb_utf8_str_new_cstr(str) : rb_utf8_str_new_cstr("");
1577
+ std::vector<char> result(8, 0);
1578
+ const int n_tokens = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
1579
+ if (n_tokens < 0) {
1580
+ result.resize(-n_tokens);
1581
+ const int check = llama_token_to_str(ptr->ctx, token, result.data(), result.size());
1582
+ if (check != -n_tokens) {
1583
+ rb_raise(rb_eRuntimeError, "failed to convert");
1584
+ return Qnil;
1585
+ }
1586
+ } else {
1587
+ result.resize(n_tokens);
1588
+ }
1589
+ std::string ret(result.data(), result.size());
1590
+ return rb_str_new_cstr(ret.c_str());
1597
1591
  }
1598
1592
 
1599
1593
  static VALUE _llama_context_logits(VALUE self) {
@@ -1649,41 +1643,64 @@ private:
1649
1643
  return output;
1650
1644
  }
1651
1645
 
1652
- static VALUE _llama_context_vocab(int argc, VALUE* argv, VALUE self) {
1653
- VALUE kw_args = Qnil;
1654
- ID kw_table[1] = { rb_intern("capacity") };
1655
- VALUE kw_values[1] = { Qundef };
1656
- rb_scan_args(argc, argv, ":", &kw_args);
1657
- rb_get_kwargs(kw_args, kw_table, 1, 0, kw_values);
1658
-
1659
- if (!RB_INTEGER_TYPE_P(kw_values[0])) {
1660
- rb_raise(rb_eArgError, "capacity must be an integer");
1646
+ static VALUE _llama_context_text(VALUE self, VALUE token_) {
1647
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1648
+ if (ptr->ctx == NULL) {
1649
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1661
1650
  return Qnil;
1662
1651
  }
1652
+ const llama_token token = NUM2INT(token_);
1653
+ const char* text = llama_token_get_text(ptr->ctx, token);
1654
+ return rb_str_new_cstr(text);
1655
+ }
1663
1656
 
1657
+ static VALUE _llama_context_score(VALUE self, VALUE token_) {
1664
1658
  LLaMAContextWrapper* ptr = get_llama_context(self);
1665
1659
  if (ptr->ctx == NULL) {
1666
1660
  rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1667
1661
  return Qnil;
1668
1662
  }
1663
+ const llama_token token = NUM2INT(token_);
1664
+ const float score = llama_token_get_score(ptr->ctx, token);
1665
+ return DBL2NUM(score);
1666
+ }
1669
1667
 
1670
- const int capacity = NUM2INT(kw_values[0]);
1671
- std::vector<const char*> strings;
1672
- std::vector<float> scores;
1673
- int n_vocab = llama_n_vocab(ptr->ctx);
1674
- strings.resize(n_vocab, NULL);
1675
- scores.resize(n_vocab, 0);
1668
+ static VALUE _llama_context_type(VALUE self, VALUE token_) {
1669
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1670
+ if (ptr->ctx == NULL) {
1671
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1672
+ return Qnil;
1673
+ }
1674
+ const llama_token token = NUM2INT(token_);
1675
+ const int type = llama_token_get_type(ptr->ctx, token);
1676
+ return INT2NUM(type);
1677
+ }
1676
1678
 
1677
- n_vocab = llama_get_vocab(ptr->ctx, strings.data(), scores.data(), capacity);
1679
+ static VALUE _llama_context_token_bos(VALUE self) {
1680
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1681
+ if (ptr->ctx == NULL) {
1682
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1683
+ return Qnil;
1684
+ }
1685
+ return INT2NUM(llama_token_bos(ptr->ctx));
1686
+ }
1678
1687
 
1679
- VALUE ret_strings = rb_ary_new();
1680
- VALUE ret_scores = rb_ary_new();
1681
- for (int i = 0; i < n_vocab; i++) {
1682
- rb_ary_push(ret_strings, rb_utf8_str_new_cstr(strings[i]));
1683
- rb_ary_push(ret_scores, DBL2NUM(static_cast<double>(scores[i])));
1688
+ static VALUE _llama_context_token_eos(VALUE self) {
1689
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1690
+ if (ptr->ctx == NULL) {
1691
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1692
+ return Qnil;
1684
1693
  }
1694
+ return INT2NUM(llama_token_eos(ptr->ctx));
1695
+ }
1685
1696
 
1686
- return rb_ary_new_from_args(2, ret_strings, ret_scores);
1697
+ static VALUE _llama_context_token_nl(VALUE self) {
1698
+ LLaMAContextWrapper* ptr = get_llama_context(self);
1699
+ if (ptr->ctx == NULL) {
1700
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
1701
+ return Qnil;
1702
+ }
1703
+ return INT2NUM(llama_token_nl(ptr->ctx));
1687
1704
  }
1688
1705
 
1689
1706
  static VALUE _llama_context_n_vocab(VALUE self) {
@@ -2474,23 +2491,15 @@ static VALUE rb_llama_model_quantize(int argc, VALUE* argv, VALUE self) {
2474
2491
  return Qnil;
2475
2492
  }
2476
2493
 
2477
- static VALUE rb_llama_token_bos(VALUE self) {
2478
- return INT2NUM(llama_token_bos());
2479
- }
2480
-
2481
- static VALUE rb_llama_token_eos(VALUE self) {
2482
- return INT2NUM(llama_token_eos());
2483
- }
2484
-
2485
- static VALUE rb_llama_token_nl(VALUE self) {
2486
- return INT2NUM(llama_token_nl());
2487
- }
2488
-
2489
2494
  static VALUE rb_llama_print_system_info(VALUE self) {
2490
2495
  const char* result = llama_print_system_info();
2491
2496
  return rb_utf8_str_new_cstr(result);
2492
2497
  }
2493
2498
 
2499
+ static VALUE rb_llama_time_us(VALUE self) {
2500
+ return LONG2NUM(llama_time_us());
2501
+ }
2502
+
2494
2503
  static VALUE rb_llama_mmap_supported(VALUE self) {
2495
2504
  return llama_mmap_supported() ? Qtrue : Qfalse;
2496
2505
  }
@@ -2519,16 +2528,29 @@ extern "C" void Init_llama_cpp(void) {
2519
2528
  rb_define_module_function(rb_mLLaMACpp, "backend_init", rb_llama_llama_backend_init, -1);
2520
2529
  rb_define_module_function(rb_mLLaMACpp, "backend_free", rb_llama_llama_backend_free, 0);
2521
2530
  rb_define_module_function(rb_mLLaMACpp, "model_quantize", rb_llama_model_quantize, -1);
2522
- rb_define_module_function(rb_mLLaMACpp, "token_bos", rb_llama_token_bos, 0);
2523
- rb_define_module_function(rb_mLLaMACpp, "token_eos", rb_llama_token_eos, 0);
2524
- rb_define_module_function(rb_mLLaMACpp, "token_nl", rb_llama_token_nl, 0);
2525
2531
  rb_define_module_function(rb_mLLaMACpp, "print_system_info", rb_llama_print_system_info, 0);
2532
+ rb_define_module_function(rb_mLLaMACpp, "time_us", rb_llama_time_us, 0);
2526
2533
  rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
2527
2534
  rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
2528
2535
  rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
2529
2536
 
2530
2537
  rb_define_const(rb_mLLaMACpp, "LLAMA_MAX_DEVICES", INT2NUM(LLAMA_MAX_DEVICES));
2531
2538
 
2539
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_ERROR", INT2NUM(LLAMA_LOG_LEVEL_ERROR));
2540
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_WARN", INT2NUM(LLAMA_LOG_LEVEL_WARN));
2541
+ rb_define_const(rb_mLLaMACpp, "LLAMA_LOG_LEVEL_INFO", INT2NUM(LLAMA_LOG_LEVEL_INFO));
2542
+
2543
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
2544
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
2545
+
2546
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
2547
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
2548
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
2549
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_CONTROL", INT2NUM(LLAMA_TOKEN_TYPE_CONTROL));
2550
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_USER_DEFINED", INT2NUM(LLAMA_TOKEN_TYPE_USER_DEFINED));
2551
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNUSED", INT2NUM(LLAMA_TOKEN_TYPE_UNUSED));
2552
+ rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_BYTE", INT2NUM(LLAMA_TOKEN_TYPE_BYTE));
2553
+
2532
2554
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_ALL_F32", INT2NUM(LLAMA_FTYPE_ALL_F32));
2533
2555
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_F16", INT2NUM(LLAMA_FTYPE_MOSTLY_F16));
2534
2556
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q4_0", INT2NUM(LLAMA_FTYPE_MOSTLY_Q4_0));
@@ -2547,6 +2569,8 @@ extern "C" void Init_llama_cpp(void) {
2547
2569
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q5_K_M", INT2NUM(LLAMA_FTYPE_MOSTLY_Q5_K_M));
2548
2570
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q6_K", INT2NUM(LLAMA_FTYPE_MOSTLY_Q6_K));
2549
2571
 
2572
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
2573
+
2550
2574
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
2551
2575
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
2552
2576
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_RULE_REF", INT2NUM(LLAMA_GRETYPE_RULE_REF));
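The Init_llama_cpp hunks above register a new module-level timer function and several new constants. A small hedged sketch of how these surface on the Ruby side, assuming the gem built from this 0.4.0 source:

```ruby
require 'llama_cpp'

t0 = LLaMACpp.time_us                 # new in 0.4.0: wraps llama_time_us()
# ... run some evaluation ...
puts LLaMACpp.time_us - t0            # elapsed time in microseconds

LLaMACpp::LLAMA_VOCAB_TYPE_SPM        # new vocab-type constants
LLaMACpp::LLAMA_TOKEN_TYPE_CONTROL    # new token-type constants (cf. Context#type)
LLaMACpp::LLAMA_FTYPE_GUESSED         # new ftype constant
```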
@@ -2556,39 +2580,9 @@ extern "C" void Init_llama_cpp(void) {
2556
2580
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_CHAR_ALT", INT2NUM(LLAMA_GRETYPE_CHAR_ALT));
2557
2581
 
2558
2582
  std::stringstream ss_magic;
2559
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGJT;
2560
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGJT", rb_str_new2(ss_magic.str().c_str()));
2561
-
2562
- ss_magic.str("");
2563
- ss_magic.clear(std::stringstream::goodbit);
2564
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGLA;
2565
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGLA", rb_str_new2(ss_magic.str().c_str()));
2566
-
2567
- ss_magic.str("");
2568
- ss_magic.clear(std::stringstream::goodbit);
2569
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGMF;
2570
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGMF", rb_str_new2(ss_magic.str().c_str()));
2571
-
2572
- ss_magic.str("");
2573
- ss_magic.clear(std::stringstream::goodbit);
2574
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGML;
2575
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGML", rb_str_new2(ss_magic.str().c_str()));
2576
-
2577
- ss_magic.str("");
2578
- ss_magic.clear(std::stringstream::goodbit);
2579
2583
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
2580
2584
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
2581
2585
 
2582
- ss_magic.str("");
2583
- ss_magic.clear(std::stringstream::goodbit);
2584
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC;
2585
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC", rb_str_new2(ss_magic.str().c_str()));
2586
-
2587
- ss_magic.str("");
2588
- ss_magic.clear(std::stringstream::goodbit);
2589
- ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_UNVERSIONED;
2590
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_UNVERSIONED", rb_str_new2(ss_magic.str().c_str()));
2591
-
2592
2586
  ss_magic.str("");
2593
2587
  ss_magic.clear(std::stringstream::goodbit);
2594
2588
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
@@ -2599,6 +2593,5 @@ extern "C" void Init_llama_cpp(void) {
2599
2593
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
2600
2594
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
2601
2595
 
2602
- rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_VERSION", rb_str_new2(std::to_string(LLAMA_FILE_VERSION).c_str()));
2603
2596
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
2604
2597
  }
data/ext/llama_cpp/src/ggml-alloc.c CHANGED
@@ -8,6 +8,7 @@
 
  #define UNUSED(x) (void)(x)
  #define MAX(a, b) ((a) > (b) ? (a) : (b))
+ #define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
 
  //#define GGML_ALLOCATOR_DEBUG
 
@@ -67,8 +68,8 @@ struct ggml_allocr {
67
68
  struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
68
69
  size_t max_size;
69
70
  bool measure;
70
- int parse_seq[GGML_MAX_NODES];
71
- bool has_parse_seq;
71
+ int parse_seq[GGML_MAX_CONCUR];
72
+ int parse_seq_len;
72
73
 
73
74
  #ifdef GGML_ALLOCATOR_DEBUG
74
75
  struct ggml_tensor * allocated_tensors[1024];
@@ -76,7 +77,7 @@ struct ggml_allocr {
76
77
  };
77
78
 
78
79
  #ifdef GGML_ALLOCATOR_DEBUG
79
- static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
80
+ static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
80
81
  for (int i = 0; i < 1024; i++) {
81
82
  if (alloc->allocated_tensors[i] == NULL) {
82
83
  alloc->allocated_tensors[i] = tensor;
@@ -85,7 +86,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
85
86
  }
86
87
  GGML_ASSERT(!"out of allocated_tensors");
87
88
  }
88
- static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
89
+ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
89
90
  for (int i = 0; i < 1024; i++) {
90
91
  if (alloc->allocated_tensors[i] == tensor ||
91
92
  (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -238,15 +239,11 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
238
239
  alloc->n_free_blocks++;
239
240
  }
240
241
 
241
- void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
242
- int pos = 0;
242
+ void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
243
243
  for (int i = 0; i < n; i++) {
244
- if (list[i] != -1) {
245
- alloc->parse_seq[pos] = list[i];
246
- pos++;
247
- }
244
+ alloc->parse_seq[i] = list[i];
248
245
  }
249
- alloc->has_parse_seq = true;
246
+ alloc->parse_seq_len = n;
250
247
  }
251
248
 
252
249
  void ggml_allocr_reset(struct ggml_allocr * alloc) {
@@ -269,7 +266,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
269
266
  /*.max_size = */ 0,
270
267
  /*.measure = */ false,
271
268
  /*.parse_seq = */ {0},
272
- /*.has_parse_seq = */ false,
269
+ /*.parse_seq_len = */ 0,
273
270
  #ifdef GGML_ALLOCATOR_DEBUG
274
271
  /*.allocated_tensors = */ = {0},
275
272
  #endif
@@ -298,7 +295,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
298
295
  /*.max_size = */ 0,
299
296
  /*.measure = */ true,
300
297
  /*.parse_seq = */ {0},
301
- /*.has_parse_seq = */ false,
298
+ /*.parse_seq_len = */ 0,
302
299
  #ifdef GGML_ALLOCATOR_DEBUG
303
300
  /*.allocated_tensors = */ = {0},
304
301
  #endif
@@ -445,8 +442,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
445
442
  else {
446
443
  AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
447
444
  node->data = parent->data;
445
+ return;
448
446
  }
449
- return;
450
447
  }
451
448
  }
452
449
  }
@@ -497,69 +494,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
497
494
  allocate_node(alloc, input);
498
495
  }
499
496
  }
500
- for (int ind = 0; ind < gf->n_nodes; ind++) {
501
- int i;
502
- if (alloc->has_parse_seq) {
503
- i = alloc->parse_seq[ind];
504
- } else {
505
- i = ind;
506
- }
507
- struct ggml_tensor * node = gf->nodes[i];
508
-
509
- // allocate parents (leafs)
510
- for (int j = 0; j < GGML_MAX_SRC; j++) {
511
- struct ggml_tensor * parent = node->src[j];
512
- if (parent == NULL) {
513
- break;
497
+ // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
498
+ int last_barrier_pos = 0;
499
+ int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes;
500
+
501
+ for (int ind = 0; ind < n_nodes; ind++) {
502
+ // allocate a node if there is no parse_seq or this is not a barrier
503
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) {
504
+ int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind;
505
+ struct ggml_tensor * node = gf->nodes[i];
506
+
507
+ // allocate parents (leafs)
508
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
509
+ struct ggml_tensor * parent = node->src[j];
510
+ if (parent == NULL) {
511
+ break;
512
+ }
513
+ allocate_node(alloc, parent);
514
514
  }
515
- allocate_node(alloc, parent);
516
- }
517
515
 
518
- // allocate node
519
- allocate_node(alloc, node);
516
+ // allocate node
517
+ allocate_node(alloc, node);
520
518
 
521
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
522
- for (int j = 0; j < GGML_MAX_SRC; j++) {
523
- struct ggml_tensor * parent = node->src[j];
524
- if (parent == NULL) {
525
- break;
526
- }
527
- AT_PRINTF("%s", parent->name);
528
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
529
- AT_PRINTF(", ");
519
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
520
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
521
+ struct ggml_tensor * parent = node->src[j];
522
+ if (parent == NULL) {
523
+ break;
524
+ }
525
+ AT_PRINTF("%s", parent->name);
526
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
527
+ AT_PRINTF(", ");
528
+ }
530
529
  }
530
+ AT_PRINTF("\n");
531
531
  }
532
- AT_PRINTF("\n");
532
+
533
533
 
534
534
  // update parents
535
- for (int j = 0; j < GGML_MAX_SRC; j++) {
536
- struct ggml_tensor * parent = node->src[j];
537
- if (parent == NULL) {
538
- break;
539
- }
540
- struct hash_node * p_hn = hash_get(ht, parent);
541
- p_hn->n_children -= 1;
542
-
543
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
544
-
545
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
546
- if (ggml_is_view(parent)) {
547
- struct ggml_tensor * view_src = get_view_source(parent);
548
- struct hash_node * view_src_hn = hash_get(ht, view_src);
549
- view_src_hn->n_views -= 1;
550
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
551
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
552
- ggml_allocator_free_tensor(alloc, view_src);
535
+ // update immediately if there is no parse_seq
536
+ // update only at barriers if there is parse_seq
537
+ if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
538
+ int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
539
+ int update_end = alloc->parse_seq_len ? ind : ind + 1;
540
+ for (int i = update_start; i < update_end; i++) {
541
+ int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i;
542
+ struct ggml_tensor * node = gf->nodes[node_i];
543
+
544
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
545
+ struct ggml_tensor * parent = node->src[j];
546
+ if (parent == NULL) {
547
+ break;
553
548
  }
554
- }
555
- else {
556
- if (parent->data != node->data) {
557
- ggml_allocator_free_tensor(alloc, parent);
549
+ struct hash_node * p_hn = hash_get(ht, parent);
550
+ p_hn->n_children -= 1;
551
+
552
+ //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
553
+
554
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
555
+ if (ggml_is_view(parent)) {
556
+ struct ggml_tensor * view_src = get_view_source(parent);
557
+ struct hash_node * view_src_hn = hash_get(ht, view_src);
558
+ view_src_hn->n_views -= 1;
559
+ AT_PRINTF("view_src %s\n", view_src->name);
560
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
561
+ ggml_allocator_free_tensor(alloc, view_src);
562
+ }
563
+ }
564
+ else {
565
+ if (parent->data != node->data) {
566
+ ggml_allocator_free_tensor(alloc, parent);
567
+ }
568
+ }
558
569
  }
559
570
  }
560
571
  }
572
+ AT_PRINTF("\n");
573
+ if (alloc->parse_seq_len) {
574
+ last_barrier_pos = ind + 1;
575
+ }
561
576
  }
562
- AT_PRINTF("\n");
563
577
  }
564
578
  // free graph outputs here that wouldn't be freed otherwise because they have no children
565
579
  if (outputs != NULL && outputs[g] != NULL) {
data/ext/llama_cpp/src/ggml-alloc.h CHANGED
@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
 
  // tell the allocator to parse nodes following the order described in the list
  // you should call this if your graph are optimized to execute out-of-order
- GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+ GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
  GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
  GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);