llama_cpp 0.14.6 → 0.15.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
-  data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
+  metadata.gz: b6da808ddaadd304ab376b4726de19087422194ef32c9e5006272569f1c4a76a
+  data.tar.gz: faf5c6ed3421cacb24a11c0d126c852d38f1a0b3edb43768133a321269958730
 SHA512:
-  metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
-  data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
+  metadata.gz: 9a83cb7da94d4672418440361d78b230f6560a97b90924c389c958a6f91b2ecded2f5e53dcbf596845687cd332ecc8126c1a7f79c33fad9b9ff20ac1ce4f8759
+  data.tar.gz: 55001246afe1615d8d8262c2f74dccbe819b4942cdb6517f5aa6e5d3e98fb2ea628db5c8e5b94a19052afff88236f003a15e7f792473b0c10660cbcf58ecab45
data/CHANGELOG.md CHANGED
@@ -1,3 +1,25 @@
+## [[0.15.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.7...v0.15.0)] - 2024-05-03
+
+- Add new build flag for using CUDA ([#18](https://github.com/yoshoku/llama_cpp.rb/pull/18)).
+- Bump llama.cpp from b2740 to b2781.
+- Change `LLAMA_SESSION_VERSION` value from 5 to 6.
+- Add constants for pre-tokenization types.
+- Add `flash_attn` accessor to `ContextParams`.
+- Add `check_tensors` accessor to `ModelParams`.
+- Add `LLAMA_KV_OVERRIDE_TYPE_STR` constant.
+
+**Breaking Change**
+- Change method names in `ModelKVOverride`.
+
+## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+- Bump llama.cpp from b2698 to b2740.
+- Add `keep_split` accessor to `ModelQuantizeParams`.
+- Add `pooling_type` method to `Context`.
+- Add `token_is_eog?` method to `Model`.
+
+The binding for `llama_sample_token_with_rng` has been skipped.
+
 ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
 
 - Bump llama.cpp from b2658 to b2698.
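
The 0.15.0 entries above translate into roughly the following Ruby sketch (the GGUF path is hypothetical; the constructor keywords follow the gem's README, and `flash_attn`/`check_tensors` are plain boolean accessors):

```ruby
require 'llama_cpp'

model_params = LLaMACpp::ModelParams.new
model_params.check_tensors = true # validate tensor data while the model loads

context_params = LLaMACpp::ContextParams.new
context_params.flash_attn = true # request flash attention for this context

model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: model_params)
context = LLaMACpp::Context.new(model: model, params: context_params)
puts context.pooling_type # Integer, the context's llama_pooling_type value
```
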
data/README.md CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use cuBLAS
-$ gem install llama_cpp -- --with-cublas
+# use CUDA
+$ gem install llama_cpp -- --with-cuda
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by the with_config method.
data/ext/llama_cpp/extconf.rb CHANGED
@@ -15,7 +15,8 @@ make_envs << ' LLAMA_QKK_64=1' if with_config('qkk-64')
 make_envs << ' LLAMA_NO_ACCELERATE=1' if with_config('no-accelerate')
 make_envs << ' LLAMA_OPENBLAS=1' if with_config('openblas')
 make_envs << ' LLAMA_BLIS=1' if with_config('blis')
-make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
+make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas') # Deprecated, use --with-cuda instead
+make_envs << ' LLAMA_CUDA=1' if with_config('cuda')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -708,9 +708,10 @@ public:
     rb_define_alloc_func(rb_cLLaMAModelKVOverride, llama_model_kv_override_alloc);
     rb_define_method(rb_cLLaMAModelKVOverride, "key", RUBY_METHOD_FUNC(_llama_model_kv_override_get_key), 0);
     rb_define_method(rb_cLLaMAModelKVOverride, "tag", RUBY_METHOD_FUNC(_llama_model_kv_override_get_tag), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "int_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_int_value), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "float_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_float_value), 0);
-    rb_define_method(rb_cLLaMAModelKVOverride, "bool_value", RUBY_METHOD_FUNC(_llama_model_kv_override_get_bool_value), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_i64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_i64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_f64", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_f64), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_bool", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_bool), 0);
+    rb_define_method(rb_cLLaMAModelKVOverride, "val_str", RUBY_METHOD_FUNC(_llama_model_kv_override_get_val_str), 0);
   }
 
   static const rb_data_type_t llama_model_kv_override_type;
@@ -726,19 +727,24 @@ private:
     return INT2NUM(ptr->tag);
   }
 
-  static VALUE _llama_model_kv_override_get_int_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_i64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return INT2NUM(ptr->int_value);
+    return INT2NUM(ptr->val_i64);
   }
 
-  static VALUE _llama_model_kv_override_get_float_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_f64(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return DBL2NUM(ptr->float_value);
+    return DBL2NUM(ptr->val_f64);
   }
 
-  static VALUE _llama_model_kv_override_get_bool_value(VALUE self) {
+  static VALUE _llama_model_kv_override_get_val_bool(VALUE self) {
     llama_model_kv_override* ptr = get_llama_model_kv_override(self);
-    return ptr->bool_value ? Qtrue : Qfalse;
+    return ptr->val_bool ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_kv_override_get_val_str(VALUE self) {
+    llama_model_kv_override* ptr = get_llama_model_kv_override(self);
+    return rb_utf8_str_new_cstr(ptr->val_str);
   }
 };
 
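In Ruby, the renamed readers dispatch naturally on `tag` using the `LLAMA_KV_OVERRIDE_TYPE_*` constants (the `STR` variant is new in this release). A hedged sketch, assuming `override` is an existing `LLaMACpp::ModelKVOverride`:

```ruby
# Pick the reader that matches the override's tag.
value =
  case override.tag
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_INT   then override.val_i64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_FLOAT then override.val_f64
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_BOOL  then override.val_bool
  when LLaMACpp::LLAMA_KV_OVERRIDE_TYPE_STR   then override.val_str
  end
puts "#{override.key} => #{value}"
```
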
@@ -800,6 +806,8 @@ public:
     rb_define_method(rb_cLLaMAModelParams, "use_mmap", RUBY_METHOD_FUNC(_llama_model_params_get_use_mmap), 0);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock=", RUBY_METHOD_FUNC(_llama_model_params_set_use_mlock), 1);
     rb_define_method(rb_cLLaMAModelParams, "use_mlock", RUBY_METHOD_FUNC(_llama_model_params_get_use_mlock), 0);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors=", RUBY_METHOD_FUNC(_llama_model_params_set_check_tensors), 1);
+    rb_define_method(rb_cLLaMAModelParams, "check_tensors", RUBY_METHOD_FUNC(_llama_model_params_get_check_tensors), 0);
   }
 
 private:
@@ -892,6 +900,18 @@ private:
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     return ptr->params.use_mlock ? Qtrue : Qfalse;
   }
+
+  // check_tensors
+  static VALUE _llama_model_params_set_check_tensors(VALUE self, VALUE check_tensors) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    ptr->params.check_tensors = RTEST(check_tensors) ? true : false;
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_params_get_check_tensors(VALUE self) {
+    LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
+    return ptr->params.check_tensors ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelParams::llama_model_params_type = {
@@ -984,6 +1004,8 @@ public:
     rb_define_method(rb_cLLaMAContextParams, "embeddings", RUBY_METHOD_FUNC(_llama_context_params_get_embeddings), 0);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv=", RUBY_METHOD_FUNC(_llama_context_params_set_offload_kqv), 1);
     rb_define_method(rb_cLLaMAContextParams, "offload_kqv", RUBY_METHOD_FUNC(_llama_context_params_get_offload_kqv), 0);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn=", RUBY_METHOD_FUNC(_llama_context_params_set_flash_attn), 1);
+    rb_define_method(rb_cLLaMAContextParams, "flash_attn", RUBY_METHOD_FUNC(_llama_context_params_get_flash_attn), 0);
   }
 
 private:
@@ -1262,6 +1284,18 @@ private:
     LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
     return ptr->params.offload_kqv ? Qtrue : Qfalse;
   }
+
+  // flash_attn
+  static VALUE _llama_context_params_set_flash_attn(VALUE self, VALUE flash_attn) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    ptr->params.flash_attn = RTEST(flash_attn) ? true : false;
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_context_params_get_flash_attn(VALUE self) {
+    LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
+    return ptr->params.flash_attn ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAContextParams::llama_context_params_type = {
@@ -1321,6 +1355,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
   }
 
 private:
@@ -1405,6 +1441,18 @@ private:
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.pure ? Qtrue : Qfalse;
   }
+
+  // keep_split
+  static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.keep_split = RTEST(keep_split) ? true : false;
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
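
A hedged sketch of `keep_split` from Ruby; the `LLaMACpp.model_quantize` keyword signature is assumed from the gem's README, and the file names are hypothetical (`keep_split` mirrors llama.cpp's option to quantize sharded GGUF files shard-by-shard):

```ruby
params = LLaMACpp::ModelQuantizeParams.new
params.keep_split = true # keep the input's split/shard layout in the output
LLaMACpp.model_quantize(input_path: 'in-00001-of-00002.gguf',
                        output_path: 'out.gguf', params: params)
```
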
@@ -1487,6 +1535,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+    rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
   }
 
 private:
@@ -1634,10 +1683,10 @@ private:
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1789,6 +1838,16 @@ private:
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_eot(ptr->model));
   }
+
+  static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
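
A hedged sketch of `token_is_eog?` in a sampling loop; `sample_next_token` is a hypothetical helper standing in for whatever sampling strategy you use. Unlike a bare `token == model.token_eos` comparison, `token_is_eog?` also covers EOT-style end-of-generation tokens:

```ruby
loop do
  token = sample_next_token(context) # hypothetical sampling helper
  break if model.token_is_eog?(token)
  print model.token_to_piece(token)
end
```
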
@@ -2102,6 +2161,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
     rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+    rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
   }
 
 private:
@@ -3225,6 +3285,15 @@ private:
 
     return Qnil;
   }
+
+  static VALUE _llama_context_pooling_type(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
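
A hedged sketch of `pooling_type`; it returns the context's raw `llama_pooling_type` enum value as an Integer, and the `LLAMA_POOLING_TYPE_*` constants are assumed to be defined by your gem version:

```ruby
case context.pooling_type
when LLaMACpp::LLAMA_POOLING_TYPE_MEAN then puts 'mean pooling (typical for embedding models)'
when LLaMACpp::LLAMA_POOLING_TYPE_CLS  then puts 'CLS-token pooling'
else puts 'no pooling'
end
```
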
@@ -3351,6 +3420,15 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
 
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEFAULT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEFAULT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_LLAMA3", INT2NUM(LLAMA_VOCAB_PRE_TYPE_LLAMA3));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_FALCON", INT2NUM(LLAMA_VOCAB_PRE_TYPE_FALCON));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_MPT", INT2NUM(LLAMA_VOCAB_PRE_TYPE_MPT));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_STARCODER", INT2NUM(LLAMA_VOCAB_PRE_TYPE_STARCODER));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_PRE_TYPE_GPT2", INT2NUM(LLAMA_VOCAB_PRE_TYPE_GPT2));
+
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNDEFINED", INT2NUM(LLAMA_TOKEN_TYPE_UNDEFINED));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_NORMAL", INT2NUM(LLAMA_TOKEN_TYPE_NORMAL));
   rb_define_const(rb_mLLaMACpp, "LLAMA_TOKEN_TYPE_UNKNOWN", INT2NUM(LLAMA_TOKEN_TYPE_UNKNOWN));
@@ -3393,6 +3471,7 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_INT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_INT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_FLOAT", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_FLOAT));
   rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_BOOL", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_BOOL));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_KV_OVERRIDE_TYPE_STR", INT2NUM(LLAMA_KV_OVERRIDE_TYPE_STR));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_END", INT2NUM(LLAMA_GRETYPE_END));
   rb_define_const(rb_mLLaMACpp, "LLAMA_GRETYPE_ALT", INT2NUM(LLAMA_GRETYPE_ALT));
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.6'
+  VERSION = '0.15.0'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2698'
+  LLAMA_CPP_VERSION = 'b2781'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -16,6 +16,15 @@ module LLaMACpp
   LLAMA_VOCAB_TYPE_BPE: Integer
   LLAMA_VOCAB_TYPE_WPM: Integer
 
+  LLAMA_VOCAB_PRE_TYPE_DEFAULT: Integer
+  LLAMA_VOCAB_PRE_TYPE_LLAMA3: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: Integer
+  LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_FALCON: Integer
+  LLAMA_VOCAB_PRE_TYPE_MPT: Integer
+  LLAMA_VOCAB_PRE_TYPE_STARCODER: Integer
+  LLAMA_VOCAB_PRE_TYPE_GPT2: Integer
+
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -48,6 +57,7 @@ module LLaMACpp
   LLAMA_KV_OVERRIDE_TYPE_INT: Integer
   LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
   LLAMA_KV_OVERRIDE_TYPE_BOOL: Integer
+  LLAMA_KV_OVERRIDE_TYPE_STR: Integer
 
   LLAMA_GRETYPE_END: Integer
   LLAMA_GRETYPE_ALT: Integer
@@ -141,6 +151,7 @@ module LLaMACpp
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
+    def token_is_eog?: (Integer) -> bool
   end
 
   class Timings
@@ -162,9 +173,10 @@ module LLaMACpp
 
     def key: () -> String
     def tag: () -> Integer
-    def int_value: () -> Integer
-    def float_value: () -> Float
-    def bool_value: () -> bool
+    def val_i64: () -> Integer
+    def val_f64: () -> Float
+    def val_bool: () -> bool
+    def val_str: () -> String
   end
 
   class ModelParams
@@ -183,6 +195,8 @@ module LLaMACpp
     def use_mmap=: (bool) -> bool
     def use_mlock: () -> bool
     def use_mlock=: (bool) -> bool
+    def check_tensors: () -> bool
+    def check_tensors=: (bool) -> bool
   end
 
   class Batch
@@ -260,6 +274,7 @@ module LLaMACpp
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
     def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+    def pooling_type: () -> Integer
   end
 
   class ContextParams
@@ -309,6 +324,8 @@ module LLaMACpp
     def embeddings=: (bool) -> bool
     def offload_kqv: () -> bool
     def offload_kqv=: (bool) -> bool
+    def flash_attn: () -> bool
+    def flash_attn=: (bool) -> bool
   end
 
   class ModelQuantizeParams
@@ -328,6 +345,8 @@ module LLaMACpp
     def only_copy=: (bool) -> bool
     def pure: () -> bool
     def pure=: (bool) -> bool
+    def keep_split: () -> bool
+    def keep_split=: (bool) -> bool
   end
 
   class Params = ContextParams
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar tests/test-grammar-integration
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@@ -386,10 +415,6 @@ ifdef LLAMA_OPENBLAS
 MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
-# TODO: temporary disable until MoE is fixed
-# https://github.com/ggerganov/llama.cpp/pull/6716
-LLAMA_NO_LLAMAFILE := 1
-
 ifndef LLAMA_NO_LLAMAFILE
 MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
 OBJS += sgemm.o
@@ -701,7 +726,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
 COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -777,7 +802,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -805,10 +830,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+		echo "unsigned char $${NAME}[] = {" && \
+		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+		echo "};" && \
+		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+	) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -971,11 +1005,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -983,7 +1013,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         free(galloc->hash_set.keys);
         free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
-        galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
         GGML_ASSERT(galloc->hash_set.keys != NULL);
         GGML_ASSERT(galloc->hash_values != NULL);
     } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
data/vendor/tmp/llama.cpp/ggml-backend.c CHANGED
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS &&
+                op->type != GGML_TYPE_IQ1_S &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
-    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
 
 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
-    size_t hash_size = sched->hash_set.size;
-    memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
-    memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
-    memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
+    if (!sched->is_reset) {
+        size_t hash_size = sched->hash_set.size;
+        memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+        memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+        memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
 
-    sched->is_reset = true;
+        sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }
 
@@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
-    bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),