llama_cpp 0.14.2 → 0.14.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d720138462a39f3fd9853befa19e55543a794eb4d1c379f7d9db516a4569db68
-  data.tar.gz: 9779852b62cf57ab208275b746ca2492921cf1f92d4ebf6be26a668f1a7bbb66
+  metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
+  data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
 SHA512:
-  metadata.gz: 7fa80468abc917099b58009a7a821c704989b8086026e92f8e71a1310ea7ec0449276aeb4653bdb4ddf499183c785b0513ab75b1fa6a94659fe15be2cf05190c
-  data.tar.gz: '091155784ead62d3ef4ec68f3b4c9f6b1fc97ef87db45327266712912501746b08df983e0d0b81b518d229d4d31f1a0d77ad36f2d7156c26141b8116049d3206'
+  metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
+  data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16

 - Fix to use metal embed library on macOS.
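The two new methods are registered in the gem's C extension, shown next. As a minimal usage sketch of `Model#n_layer` (the model path is a hypothetical placeholder; loading follows the gem's usual `ModelParams`/`Model` pattern):

```ruby
require 'llama_cpp'

# Hypothetical local GGUF file; any model readable by llama.cpp works.
model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)

# New in 0.14.3: the layer count, alongside the existing
# n_vocab / n_ctx_train / n_embd readers.
puts model.n_layer
```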
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }

+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }

 private:
@@ -3153,6 +3160,59 @@ private:

     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
3195
+ for (size_t i = 0; i < data.size(); i++) {
3196
+ data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
3197
+ }
3198
+ const int32_t n_embd = NUM2INT(kw_values[1]);
3199
+ const int32_t il_start = NUM2INT(kw_values[2]);
3200
+ const int32_t il_end = NUM2INT(kw_values[3]);
3201
+
3202
+ int32_t err = 0;
3203
+ if (NIL_P(kw_values[0])) {
3204
+ err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
3205
+ } else {
3206
+ err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
3207
+ }
3208
+
3209
+ if (err) {
3210
+ rb_raise(rb_eRuntimeError, "Failed to apply control vector");
3211
+ return Qnil;
3212
+ }
3213
+
3214
+ return Qnil;
3215
+ }
3156
3216
  };
3157
3217
 
3158
3218
  const rb_data_type_t RbLLaMAContext::llama_context_type = {
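Mirroring the keyword parsing above, `Context#apply_control_vector` takes `data:`, `n_embd:`, `il_start:`, and `il_end:`; passing `data: nil` forwards NULL to `llama_control_vector_apply`, which clears any previously applied vector. A minimal sketch, assuming `model` was loaded as in the earlier example and using placeholder values rather than a trained control vector:

```ruby
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# data is a flat Array[Float]: one n_embd-sized block per layer
# in il_start..il_end. Zeros here are placeholders only.
n_embd   = model.n_embd
il_start = 1
il_end   = model.n_layer
data = Array.new(n_embd * (il_end - il_start + 1), 0.0)

context.apply_control_vector(data: data, n_embd: n_embd,
                             il_start: il_start, il_end: il_end)

# data: nil clears the control vector again.
context.apply_control_vector(data: nil, n_embd: n_embd,
                             il_start: il_start, il_end: il_end)
```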
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.3'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2496'
 end
data/sig/llama_cpp.rbs CHANGED
@@ -108,6 +108,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +245,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end

   class ContextParams
@@ -9,7 +9,8 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -555,7 +556,7 @@ endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -597,6 +598,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif

+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -663,6 +669,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -745,7 +754,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -753,6 +762,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -861,6 +874,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

             ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);