llama_cpp 0.14.6 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
- data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
+ metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+ data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
  SHA512:
- metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
- data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
+ metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+ data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md CHANGED
@@ -1,3 +1,25 @@
+ ## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+ - Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+ - Bump llama.cpp from b2740 to b2781.
+ - Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+ - Add constants for pre-tokenization types.
+ - Add `flash_attn` accessor to `ContextParams`.
+ - Add `check_tensors` accessor to `ModelParams`.
+ - Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+ **Breaking Change**
+ - Change method names in `ModelKVOverride`.
+
+ ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+ - Bump llama.cpp from b2698 to b2740.
+ - Add `keep_split` accessor to `ModelQuantizeParams`.
+ - Add `pooling_type` method to `Context`.
+ - Add `token_is_eog?` method to `Model`.
+
+ The binding for `llama_sample_token_with_rng` has been skipped.
+
  ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20

  - Bump llama.cpp from b2658 to b2698.
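
The 0.15.0 entry above is a breaking change: the `ModelKVOverride` getters are renamed to match the upstream `llama_model_kv_override` struct fields, and a string variant is added. A minimal migration sketch (the `override` object here is hypothetical, obtained however your code currently enumerates metadata overrides):

```ruby
# 0.14.x: override.int_value / override.float_value / override.bool_value
# 0.15.0: override.val_i64 / override.val_f64 / override.val_bool / override.val_str
case override.tag
when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then puts override.val_i64
when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then puts override.val_f64
when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then puts override.val_bool
when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then puts override.val_str # new in 0.15.0
end
```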
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
  # use OpenBLAS
  $ gem install llama_cpp -- --with-openblas

- # use cuBLAS
- $ gem install llama_cpp -- --with-cublas
+ # use CUDA
+ $ gem install llama_cpp -- --with-cuda
  ```

  Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by the `with_config` method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
  make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
  make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
  make_envs << ' LLAMA_BLIS=1' if with_config('blis')
- make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+ make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
  make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
  make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
  make_envs << ' LLAMA_MPI=1' if with_config('mpi')
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -708,9 +708,10 @@ public:
  rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
  rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
  rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
- rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+ rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
  }

  static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
  return INT2NUM(ptr->tag);
  }

- static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return INT2NUM(ptr->int_value);
+ return INT2NUM(ptr->val_i64);
  }

- static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return DBL2NUM(ptr->float_value);
+ return DBL2NUM(ptr->val_f64);
  }

- static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
+ static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
  llama_model_kv_override* ptr = get_llama_model_kv_override(self);
- return ptr->bool_value ? Qtrue : Qfalse;
+ return ptr->val_bool ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+ llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+ return rb_utf8_str_new_cstr(ptr->val_str);
  }
  };

@@ -800,6 +806,8 @@ public:
  rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
  rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
  rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+ rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
  }

  private:
@@ -892,6 +900,18 @@ private:
  LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
  return ptr->params.use_mlock ? Qtrue : Qfalse;
  }
+
+ // check_tensors
+ static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+ LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+ return ptr->params.check_tensors ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
@@ -984,6 +1004,8 @@ public:
  rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
  rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+ rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
  }

  private:
@@ -1262,6 +1284,18 @@ private:
  LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
  return ptr->params.offload_kqv ? Qtrue : Qfalse;
  }
+
+ // flash_attn
+ static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+ LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+ return ptr->params.flash_attn ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -1321,6 +1355,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
  }

  private:
@@ -1405,6 +1441,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.pure ? Qtrue : Qfalse;
  }
+
+ // keep_split
+ static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.keep_split = RTEST(keep_split) ? true : false;
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
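
The three new boolean accessors in this release (`check_tensors`, `flash_attn`, `keep_split`) all follow the pattern above: the setter coerces any truthy Ruby value via `RTEST`, and the getter returns `true`/`false`. A short usage sketch, assuming the parameter classes can be default-constructed as in earlier releases:

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true   # validate tensor data while loading

context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true    # enable flash attention

quantize_params = LLaMACpp::ModelQuantizeParams.new
quantize_params.keep_split = true   # keep the shard layout when quantizing
```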
@@ -1487,6 +1535,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+ rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
  }

  private:
@@ -1634,10 +1683,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1789,6 +1838,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_eot(ptr->model));
  }
+
+ static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
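
`Model#token_is_eog?` wraps `llama_token_is_eog`, which reports whether a token marks end-of-generation (EOS, EOT, and similar). A hedged sketch of how it could replace an explicit EOS comparison in a sampling loop; `model`, `context`, and `candidates` are assumed to already exist:

```ruby
loop do
  token = context.sample_token_greedy(candidates)
  # previously something like: break if token == model.token_eos
  break if model.token_is_eog?(token)
  # ...append the token and keep decoding...
end
```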
@@ -2102,6 +2161,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
  rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+ rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
  }

  private:
@@ -3225,6 +3285,15 @@ private:

  return Qnil;
  }
+
+ static VALUE _llama_context_pooling_type(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+ }
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
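
`Context#pooling_type` simply returns the integer from `llama_pooling_type(ctx)`. A small sketch, assuming the gem also exports the matching `LLAMA_POOLING_TYPE_*` constants:

```ruby
case context.pooling_type
when LLaMACpp::LLAMA_POOLING_TYPE_NONE then puts 'no pooling'
when LLaMACpp::LLAMA_POOLING_TYPE_MEAN then puts 'mean pooling'
when LLaMACpp::LLAMA_POOLING_TYPE_CLS  then puts 'CLS-token pooling'
end
```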
@@ -3351,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));

+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
  rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3393,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));

  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
  rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.6'
+ VERSION = '0.15.0'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2698'
+ LLAMA_CPP_VERSION = 'b2781'
  end
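
Both bumped constants from the hunk above can be verified at runtime after upgrading:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.15.0"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2781"
```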
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
  LLAMA_VOCAB_TYPE_BPE: Integer
  LLAMA_VOCAB_TYPE_WPM: Integer

+ LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+ LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+ LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+ LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+ LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
  LLAMA_FTYPE_ALL_F32: Integer
  LLAMA_FTYPE_MOSTLY_F16: Integer
  LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
  LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+ LLAMA_KV_OVERRIDE_TYPE_STR: Integer

  LLAMA_GRETYPE_END: Integer
  LLAMA_GRETYPE_ALT: Integer
@@ -141,6 +151,7 @@ module LLaMACpp
  def token_middle: () -> Integer
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
+ def token_is_eog?: (Integer) -> bool
  end

  class Timings
@@ -162,9 +173,10 @@ module LLaMACpp

  def key: () -> String
  def tag: () -> Integer
- def int_value: () -> Integer
- def float_value: () -> Float
- def bool_value: () -> bool
+ def val_i64: () -> Integer
+ def val_f64: () -> Float
+ def val_bool: () -> bool
+ def val_str: () -> String
  end

  class ModelParams
@@ -183,6 +195,8 @@ module LLaMACpp
  def use_mmap=: (bool) -> bool
  def use_mlock: () -> bool
  def use_mlock=: (bool) -> bool
+ def check_tensors: () -> bool
+ def check_tensors=: (bool) -> bool
  end

  class Batch
@@ -260,6 +274,7 @@ module LLaMACpp
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+ def pooling_type: () -> Integer
  end

  class ContextParams
@@ -309,6 +324,8 @@ module LLaMACpp
  def embeddings=: (bool) -> bool
  def offload_kqv: () -> bool
  def offload_kqv=: (bool) -> bool
+ def flash_attn: () -> bool
+ def flash_attn=: (bool) -> bool
  end

  class ModelQuantizeParams
@@ -328,6 +345,8 @@ module LLaMACpp
  def only_copy=: (bool) -> bool
  def pure: () -> bool
  def pure=: (bool) -> bool
+ def keep_split: () -> bool
+ def keep_split=: (bool) -> bool
  end

  class Params = ContextParams
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \

  # Binaries only useful for tests
  TEST_TARGETS = \
- tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
- tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
- tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar tests/test-grammar-integration
+ tests/test-autorelease \
+ tests/test-backend-ops \
+ tests/test-double-float \
+ tests/test-grad0 \
+ tests/test-grammar-integration \
+ tests/test-grammar-parser \
+ tests/test-json-schema-to-grammar \
+ tests/test-llama-grammar \
+ tests/test-model-load-cancel \
+ tests/test-opt \
+ tests/test-quantize-fns \
+ tests/test-quantize-perf \
+ tests/test-rope \
+ tests/test-sampling \
+ tests/test-tokenizer-0 \
+ tests/test-tokenizer-1-bpe \
+ tests/test-tokenizer-1-spm

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
  UNAME_M := $(shell uname -m)
  endif

+ # In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+ # of non-gcc compilers don't have to provide g++ alias or wrapper.
+ DEFCC := cc
+ DEFCXX := c++
+ ifeq ($(origin CC),default)
+ CC := $(DEFCC)
+ endif
+ ifeq ($(origin CXX),default)
+ CXX := $(DEFCXX)
+ endif
+
  # Mac OS + Arm can report x86_64
  # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
  ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
  test: $(TEST_TARGETS)
  @failures=0; \
  for test_target in $(TEST_TARGETS); do \
- if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
- ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+ if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
  ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
- elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+ ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+ elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
  continue; \
  elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
  continue; \
@@ -386,10 +415,6 @@ ifdef LLAMA_OPENBLAS
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
  endif # LLAMA_OPENBLAS

- # TODO: temporary disable until MoE is fixed
- # https://github.com/ggerganov/llama.cpp/pull/6716
- LLAMA_NO_LLAMAFILE := 1
-
  ifndef LLAMA_NO_LLAMAFILE
  MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
  OBJS += sgemm.o
@@ -701,7 +726,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
  COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

  common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -777,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+ quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -805,10 +830,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

+ # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+ examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -971,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
- tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+ tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -983,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+ tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -371,16 +371,16 @@ struct ggml_gallocr {
  };

  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
  GGML_ASSERT(galloc != NULL);

- galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
  GGML_ASSERT(galloc->bufts != NULL);

- galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
  GGML_ASSERT(galloc->buffers != NULL);

- galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
  GGML_ASSERT(galloc->buf_tallocs != NULL);

  for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  free(galloc->hash_set.keys);
  free(galloc->hash_values);
  galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
- galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+ galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
  GGML_ASSERT(galloc->hash_set.keys != NULL);
  GGML_ASSERT(galloc->hash_values != NULL);
  } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
  free(galloc->node_allocs);
- galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
  GGML_ASSERT(galloc->node_allocs != NULL);
  }
  galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+ return
+ op->type != GGML_TYPE_IQ2_XXS &&
+ op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ1_S &&
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+ struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

  // initialize hash table
  sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+ sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));

  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));

  sched->n_backends = n_backends;

  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

  const int initial_splits_capacity = 16;
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;

  for (int b = 0; b < n_backends; b++) {
@@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

  void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
  // reset state for the next run
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+ if (!sched->is_reset) {
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

- sched->is_reset = true;
+ sched->is_reset = true;
+ }
  sched->is_alloc = false;
  }

@@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = {
  /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+ /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
  };
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+ struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),