llama_cpp 0.14.4 → 0.14.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
- data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
+ metadata.gz: 7d80abb57b135ff04718e34099accaaabf3358553b0f061d79b195a99386739d
+ data.tar.gz: 5b24a9b7846b962f4063a0e50f15c6d9a9c874d1931ed32c200f3383869a2fd9
  SHA512:
- metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
- data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
+ metadata.gz: dfb20e108a57b65ff624db1e2ee37034ffca406d906268d89ff441099a02c00fd67743a786a0353df2368614003604a4bf5982089024f14aee2e0f95e210e297
+ data.tar.gz: 0a0bbd93dfe57e033f25e5c3e3d61fb568362aa2d317851dbb69fe620e5e30bc8b08c27272579e7841c50b87984abf70ade4a9e7e34fb2615e106a5c2474b79e
data/CHANGELOG.md CHANGED
@@ -1,10 +1,18 @@
+ ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
+
+ - Bump llama.cpp from b2608 to b2658.
+ - Add magic number constants.
+ - Add `token_cls` and `token_sep` methods to `Model`.
+
+ Implementation bindings for llama_state_get_size, llama_state_get_data, llama_state_set_data, llama_state_load_file, llama_state_save_file, llama_state_seq_get_size, llama_state_seq_get_data, llama_state_seq_set_data, llama_state_seq_save_file, and llama_state_seq_load_file have been skipped.
+
  ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
 
  - Bump llama.cpp from b2496 to b2573.
  - Add file type constants.
  - Bump llama.cpp from b2573 to b2608.
 
- Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+ Implementation bindings for llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 have been skipped.
 
  ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
 
data/examples/chat.rb CHANGED
@@ -127,8 +127,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
  end
 
  if input_echo
- output = []
- embd.each { |token| output << context.model.token_to_piece(token) }
+ output = embd.map { |token| context.model.token_to_piece(token) }
  output_str = output.join
  output_str.chomp!(antiprompt) if first_input
  print(output_str)
@@ -136,8 +135,7 @@ class Chat < Thor # rubocop:disable Metrics/ClassLength, Style/Documentation
 
  if embd_input.size <= n_consumed
  if antiprompt.size.positive?
- last_output = []
- last_n_tokens.each { |token| last_output << context.model.token_to_piece(token) }
+ last_output = last_n_tokens.map { |token| context.model.token_to_piece(token) }
  last_output_str = last_output.join
 
  search_start_pos = last_output_str.size > antiprompt.size ? last_output_str.size - antiprompt.size : 0
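The two chat.rb hunks above collapse the manual accumulator (`each` plus `<<`) into `Array#map`. A standalone sketch of the equivalence, with a hypothetical `to_piece` lambda standing in for `context.model.token_to_piece` (which needs a loaded model):

```ruby
# Hypothetical stand-in for context.model.token_to_piece.
to_piece = ->(token) { "piece_#{token}" }
embd = [1, 2, 3]

# Before: build the array with an explicit accumulator.
output = []
embd.each { |token| output << to_piece.call(token) }

# After: map expresses the same transformation in one step.
output_mapped = embd.map { |token| to_piece.call(token) }

raise 'refactor changed behaviour' unless output == output_mapped
puts output_mapped.join
```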
@@ -37,6 +37,7 @@ if RUBY_PLATFORM.match?(/darwin/)
  abort('Failed to set installation path for libllama.dylib.') unless mkstatus.success?
  end
  FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal-embed.metal", VENDOR_LIB_DIR)
+ FileUtils.cp("#{LLAMA_CPP_DIR}/ggml-metal.metal", VENDOR_LIB_DIR)
  end
 
  abort('libstdc++ is not found.') unless have_library('stdc++')
@@ -1478,6 +1478,8 @@ public:
  rb_define_method(rb_cLLaMAModel, "type", RUBY_METHOD_FUNC(_llama_model_get_type), 1);
  rb_define_method(rb_cLLaMAModel, "token_bos", RUBY_METHOD_FUNC(_llama_model_token_bos), 0);
  rb_define_method(rb_cLLaMAModel, "token_eos", RUBY_METHOD_FUNC(_llama_model_token_eos), 0);
+ rb_define_method(rb_cLLaMAModel, "token_cls", RUBY_METHOD_FUNC(_llama_model_token_cls), 0);
+ rb_define_method(rb_cLLaMAModel, "token_sep", RUBY_METHOD_FUNC(_llama_model_token_sep), 0);
  rb_define_method(rb_cLLaMAModel, "token_nl", RUBY_METHOD_FUNC(_llama_model_token_nl), 0);
  rb_define_method(rb_cLLaMAModel, "add_bos_token?", RUBY_METHOD_FUNC(_llama_model_add_bos_token), 0);
  rb_define_method(rb_cLLaMAModel, "add_eos_token?", RUBY_METHOD_FUNC(_llama_model_add_eos_token), 0);
@@ -1743,6 +1745,16 @@ private:
  return INT2NUM(llama_token_eos(ptr->model));
  }
 
+ static VALUE _llama_model_token_cls(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_token_cls(ptr->model));
+ }
+
+ static VALUE _llama_model_token_sep(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_token_sep(ptr->model));
+ }
+
  static VALUE _llama_model_token_nl(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_nl(ptr->model));
@@ -3414,15 +3426,26 @@ extern "C" void Init_llama_cpp(void) {
  ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSN;
  rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSN", rb_str_new2(ss_magic.str().c_str()));
 
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
+ ss_magic << std::showbase << std::hex << LLAMA_FILE_MAGIC_GGSQ;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FILE_MAGIC_GGSQ", rb_str_new2(ss_magic.str().c_str()));
+
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_SESSION_MAGIC;
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_MAGIC", rb_str_new2(ss_magic.str().c_str()));
 
+ ss_magic.str("");
+ ss_magic.clear(std::stringstream::goodbit);
+ ss_magic << std::showbase << std::hex << LLAMA_STATE_SEQ_MAGIC;
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_MAGIC", rb_str_new2(ss_magic.str().c_str()));
+
  ss_magic.str("");
  ss_magic.clear(std::stringstream::goodbit);
  ss_magic << std::showbase << std::hex << LLAMA_DEFAULT_SEED;
  rb_define_const(rb_mLLaMACpp, "LLAMA_DEFAULT_SEED", rb_str_new2(ss_magic.str().c_str()));
 
  rb_define_const(rb_mLLaMACpp, "LLAMA_SESSION_VERSION", rb_str_new2(std::to_string(LLAMA_SESSION_VERSION).c_str()));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_STATE_SEQ_VERSION", rb_str_new2(std::to_string(LLAMA_STATE_SEQ_VERSION).c_str()));
  }
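The constants registered in the hunk above are exposed on the `LLaMACpp` module as Strings (the magic numbers as hex strings via the `std::showbase << std::hex` formatting, the state-seq version as a decimal string). A quick sketch of reading them from Ruby:

```ruby
require 'llama_cpp'

# New constants added in 0.14.5, alongside the existing session ones.
puts LLaMACpp::LLAMA_FILE_MAGIC_GGSQ     # hex string, e.g. "0x6767..."
puts LLaMACpp::LLAMA_STATE_SEQ_MAGIC     # hex string
puts LLaMACpp::LLAMA_STATE_SEQ_VERSION   # decimal string
puts LLaMACpp::LLAMA_SESSION_MAGIC
puts LLaMACpp::LLAMA_SESSION_VERSION
```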
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.4'
+ VERSION = '0.14.5'
 
  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2608'
+ LLAMA_CPP_VERSION = 'b2658'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -3,6 +3,14 @@ module LLaMACpp
  LLAMA_CPP_VERSION: String
  LLAMA_DEFALUT_SEED: String
 
+ LLAMA_FILE_MAGIC_GGLA: String
+ LLAMA_FILE_MAGIC_GGSN: String
+ LLAMA_FILE_MAGIC_GGSQ: String
+ LLAMA_SESSION_MAGIC: String
+ LLAMA_SESSION_VERSION: String
+ LLAMA_STATE_SEQ_MAGIC: String
+ LLAMA_STATE_SEQ_VERSION: String
+
  LLAMA_VOCAB_TYPE_NONE: Integer
  LLAMA_VOCAB_TYPE_SPM: Integer
  LLAMA_VOCAB_TYPE_BPE: Integer
@@ -124,6 +132,8 @@ module LLaMACpp
  def type: (Integer) -> Integer
  def token_bos: () -> Integer
  def token_eos: () -> Integer
+ def token_cls: () -> Integer
+ def token_sep: () -> Integer
  def token_nl: () -> Integer
  def add_bos_token?: () -> bool
  def add_eos_token?: () -> bool
@@ -1,6 +1,6 @@
  MIT License
 
- Copyright (c) 2023 Georgi Gerganov
+ Copyright (c) 2023-2024 The ggml authors
 
  Permission is hereby granted, free of charge, to any person obtaining a copy
  of this software and associated documentation files (the "Software"), to deal
@@ -1,7 +1,7 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
  retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
  # Binaries only useful for tests
@@ -10,7 +10,7 @@ TEST_TARGETS = \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
  tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
- tests/test-json-schema-to-grammar
+ tests/test-json-schema-to-grammar tests/test-grammar-integration
 
  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -648,7 +648,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
  ifndef CUDA_DOCKER_ARCH
  ifndef CUDA_POWER_ARCH
- $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
@@ -805,6 +805,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -923,6 +927,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+ tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -137,7 +137,7 @@ extern "C" {
  /*
  Example usage:
 
- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned
+ // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
  // preferrably to run on the same backend as the buffer
  ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
@@ -1225,7 +1225,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
  // the main device has a larger memory buffer to hold the results from all GPUs
  // ldc == nrows of the matrix that cuBLAS writes into
- int ldc = id == ctx.device ? ne0 : row_diff;
+ int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
  const int compute_capability = ggml_cuda_info().devices[id].cc;
 
@@ -1377,8 +1377,8 @@ static void ggml_cuda_op_mul_mat(
  const int64_t ne0 = dst->ne[0];
  const int64_t ne1 = dst->ne[1];
 
- const int nb2 = dst->nb[2];
- const int nb3 = dst->nb[3];
+ const int64_t nb2 = dst->nb[2];
+ const int64_t nb3 = dst->nb[3];
 
  GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer));
  GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer));
@@ -2617,6 +2617,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
  return false;
  }
 
+ #if CUDART_VERSION >= 11100
  cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
  if (err != cudaSuccess) {
  // clear the error
@@ -2627,6 +2628,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
  return false;
  }
  return true;
+ #else
+ return false;
+ #endif
  }
 
  GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {