llama_cpp 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d720138462a39f3fd9853befa19e55543a794eb4d1c379f7d9db516a4569db68
-  data.tar.gz: 9779852b62cf57ab208275b746ca2492921cf1f92d4ebf6be26a668f1a7bbb66
+  metadata.gz: 3b503998061ee4c8a87bc3148d41feda0b45b04cbe0cafdb3897d1d457b26e0a
+  data.tar.gz: b761a18fd964ca0a4e871d01cc0a6058527c951413de7b110a8b07862ed64d8c
 SHA512:
-  metadata.gz: 7fa80468abc917099b58009a7a821c704989b8086026e92f8e71a1310ea7ec0449276aeb4653bdb4ddf499183c785b0513ab75b1fa6a94659fe15be2cf05190c
-  data.tar.gz: '091155784ead62d3ef4ec68f3b4c9f6b1fc97ef87db45327266712912501746b08df983e0d0b81b518d229d4d31f1a0d77ad36f2d7156c26141b8116049d3206'
+  metadata.gz: 2951b2a59b0579f3afa983283a73853300f822891f0d1dfef292727d6f313392ddc68902144caaca33b173e43e95076dda02ffa97228cf7f65babc4ac82354c9
+  data.tar.gz: cb655d32282b28ebaee30b87b882600c79a6666c306de2692a059da3de1bc21d3c988116fd1dd26d97c00ea0f22fdccc8b3f8d94b20cb01c819d9a578c71bd67
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
+## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+- Bump llama.cpp from b2435 to b2496.
+- Add `n_layer` method to `Model`.
+- Add `apply_control_vector` method to `Context`.
+
 ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16

 - Fix to use metal embed library on macOS.
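
Both additions surface existing llama.cpp functions to Ruby. A minimal usage sketch — assuming the `Model`/`Context` construction API shown in the gem's README, with a hypothetical model path; per upstream llama.h, the control-vector `data` is expected to be n_embd floats per layer:

    require 'llama_cpp'

    model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: LLaMACpp::ModelParams.new)
    context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

    model.n_layer # => Integer, number of transformer layers (new in 0.14.3)

    # Placeholder zero vector; a real control vector would come from training.
    data = Array.new(model.n_layer * model.n_embd, 0.0)
    context.apply_control_vector(data: data, n_embd: model.n_embd,
                                 il_start: 1, il_end: model.n_layer)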
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1466,6 +1466,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
     rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
     rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+    rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
     rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
     rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
     rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
     return INT2NUM(llama_n_embd(ptr->model));
   }

+  static VALUE _llama_model_get_model_n_layer(VALUE self) {
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return INT2NUM(llama_n_layer(ptr->model));
+  }
+
   static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+    rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
   }

 private:
@@ -3153,6 +3160,59 @@ private:

     return Qnil;
   }
+
+  static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+    VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+    rb_scan_args(argc, argv, ":", &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+    if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "data must be an Array or nil");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "n_embd must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "il_start must be an Integer");
+      return Qnil;
+    }
+    if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+      rb_raise(rb_eArgError, "il_end must be an Integer");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+
+    std::vector<float> data(RARRAY_LEN(kw_values[0]));
+    for (size_t i = 0; i < data.size(); i++) {
+      data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+    }
+    const int32_t n_embd = NUM2INT(kw_values[1]);
+    const int32_t il_start = NUM2INT(kw_values[2]);
+    const int32_t il_end = NUM2INT(kw_values[3]);
+
+    int32_t err = 0;
+    if (NIL_P(kw_values[0])) {
+      err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+    } else {
+      err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+    }
+
+    if (err) {
+      rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+      return Qnil;
+    }
+
+    return Qnil;
+  }
 };

 const rb_data_type_t RbLLaMAContext::llama_context_type = {
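
All four keywords are required by the binding above, and `data` may also be `nil`, in which case `NULL` is forwarded to `llama_control_vector_apply` (upstream's convention for clearing a control vector). A short sketch of the error paths a caller can expect, reusing the hypothetical `model` and `context` from the earlier example:

    begin
      context.apply_control_vector(data: 'oops', n_embd: model.n_embd,
                                   il_start: 1, il_end: model.n_layer)
    rescue ArgumentError => e
      e.message # => "data must be an Array or nil"
    rescue RuntimeError => e
      e.message # => "Failed to apply control vector" (llama.cpp rejected the vector)
    end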
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.2'
+  VERSION = '0.14.3'

   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2435'
+  LLAMA_CPP_VERSION = 'b2496'
 end
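
A quick way to confirm the upgrade took effect is to read the two constants above:

    require 'llama_cpp'

    LLaMACpp::VERSION           # => '0.14.3'
    LLaMACpp::LLAMA_CPP_VERSION # => 'b2496'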
data/sig/llama_cpp.rbs CHANGED
@@ -108,6 +108,7 @@ module LLaMACpp
     def n_vocab: () -> Integer
     def n_ctx_train: () -> Integer
     def n_embd: () -> Integer
+    def n_layer: () -> Integer
     def rope_freq_scale_train: () -> Float
     def token_to_piece: (Integer) -> String
     def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +245,7 @@ module LLaMACpp
     def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+    def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
   end

   class ContextParams
@@ -9,7 +9,8 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+	tests/test-json-schema-to-grammar

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -555,7 +556,7 @@ endif
 endif # LLAMA_METAL

 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
 	$(CC) $(CFLAGS) -c $< -o $@

 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -597,6 +598,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif

+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -663,6 +669,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

+json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 train.o: common/train.cpp common/train.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -745,7 +754,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -753,6 +762,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -861,6 +874,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

             ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);