llama_cpp 0.14.2 → 0.14.4

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d720138462a39f3fd9853befa19e55543a794eb4d1c379f7d9db516a4569db68
- data.tar.gz: 9779852b62cf57ab208275b746ca2492921cf1f92d4ebf6be26a668f1a7bbb66
+ metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+ data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
  SHA512:
- metadata.gz: 7fa80468abc917099b58009a7a821c704989b8086026e92f8e71a1310ea7ec0449276aeb4653bdb4ddf499183c785b0513ab75b1fa6a94659fe15be2cf05190c
- data.tar.gz: '091155784ead62d3ef4ec68f3b4c9f6b1fc97ef87db45327266712912501746b08df983e0d0b81b518d229d4d31f1a0d77ad36f2d7156c26141b8116049d3206'
+ metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+ data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
data/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
+ ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+ - Bump llama.cpp from b2496 to b2573.
+ - Add file type constants.
+ - Bump llama.cpp from b2573 to b2608.
+
+ Implementation of the llama_split_path, llama_split_prefix, llama_grammar_accept, and decode_utf8 bindings has been skipped.
+
+ ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+ - Bump llama.cpp from b2435 to b2496.
+ - Add `n_layer` method to `Model`.
+ - Add `apply_control_vector` method to `Context`.
+
  ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16

  - Fix to use metal embed library on macOS.
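
The 0.14.3 additions are a small API surface: `Model#n_layer` (wrapping llama.cpp's `llama_n_layer`) and `Context#apply_control_vector` (wrapping `llama_control_vector_apply`); both bindings appear in the C++ hunks below. A minimal usage sketch, not from the gem's docs: it assumes the `Model`/`Context` constructors take `model_path:`/`params:` and `model:`/`params:` keywords, uses a placeholder GGUF path, and follows llama.cpp's control-vector layout of one `n_embd`-sized block per affected layer.

```ruby
require 'llama_cpp'

# 'path/to/model.gguf' is a placeholder; any GGUF model works here.
model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf',
                            params: LLaMACpp::ModelParams.new)
puts model.n_layer # new in 0.14.3

context = LLaMACpp::Context.new(model: model,
                                params: LLaMACpp::ContextParams.new)

# Apply an all-zero control vector to layers 1..2 (placeholder values;
# real vectors come from a control-vector GGUF or your own training).
n_embd = model.n_embd
context.apply_control_vector(data: Array.new(n_embd * 2, 0.0),
                             n_embd: n_embd, il_start: 1, il_end: 2)
```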
data/ext/llama_cpp/llama_cpp.cpp CHANGED
@@ -1466,6 +1466,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+ rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
  rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
    return INT2NUM(llama_n_embd(ptr->model));
  }

+ static VALUE _llama_model_get_model_n_layer(VALUE self) {
+   LLaMAModelWrapper* ptr = get_llama_model(self);
+   return INT2NUM(llama_n_layer(ptr->model));
+ }
+
  static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
    LLaMAModelWrapper* ptr = get_llama_model(self);
    return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
@@ -2083,6 +2089,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+ rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
  }

  private:
@@ -3153,6 +3160,59 @@ private:

    return Qnil;
  }
+
+ static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+   VALUE kw_args = Qnil;
+   ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+   VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+   rb_scan_args(argc, argv, ":", &kw_args);
+   rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+   if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+     rb_raise(rb_eArgError, "data must be an Array or nil");
+     return Qnil;
+   }
+   if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+     rb_raise(rb_eArgError, "n_embd must be an Integer");
+     return Qnil;
+   }
+   if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+     rb_raise(rb_eArgError, "il_start must be an Integer");
+     return Qnil;
+   }
+   if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+     rb_raise(rb_eArgError, "il_end must be an Integer");
+     return Qnil;
+   }
+
+   LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+   if (ctx_ptr->ctx == NULL) {
+     rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+     return Qnil;
+   }
+
+   std::vector<float> data(RARRAY_LEN(kw_values[0]));
+   for (size_t i = 0; i < data.size(); i++) {
+     data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+   }
+   const int32_t n_embd = NUM2INT(kw_values[1]);
+   const int32_t il_start = NUM2INT(kw_values[2]);
+   const int32_t il_end = NUM2INT(kw_values[3]);
+
+   int32_t err = 0;
+   if (NIL_P(kw_values[0])) {
+     err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+   } else {
+     err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+   }
+
+   if (err) {
+     rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+     return Qnil;
+   }
+
+   return Qnil;
+ }
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
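
In the binding above, `rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values)` makes all four keywords required, and each type check raises `ArgumentError` before the llama.cpp context is touched. A small sketch of that failure mode, reusing the hypothetical `context` from the earlier example:

```ruby
begin
  # n_embd must be an Integer, so this raises before reaching llama.cpp.
  context.apply_control_vector(data: [0.0], n_embd: 'not-an-int',
                               il_start: 1, il_end: 1)
rescue ArgumentError => e
  puts e.message # => "n_embd must be an Integer"
end
```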
@@ -3311,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));

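The four constants registered above expose llama.cpp's newer IQ file types to Ruby. A hedged sketch of requantizing with one of them; `LLaMACpp.model_quantize` and `ModelQuantizeParams#ftype=` are assumed from the gem's pre-existing quantization API, and both file paths are placeholders:

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.ftype = LLaMACpp::LLAMA_FTYPE_MOSTLY_IQ4_XS # new in 0.14.4

LLaMACpp.model_quantize(input_path: 'model-f16.gguf',
                        output_path: 'model-iq4_xs.gguf',
                        params: params)
```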
data/lib/llama_cpp/version.rb CHANGED
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
    # The version of llama_cpp.rb you install.
-   VERSION = '0.14.2'
+   VERSION = '0.14.4'

    # The version of llama.cpp bundled with llama_cpp.rb.
-   LLAMA_CPP_VERSION = 'b2435'
+   LLAMA_CPP_VERSION = 'b2608'
  end
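
A quick post-upgrade sanity check; both constants are defined exactly as in the hunk above:

```ruby
require 'llama_cpp'

puts LLaMACpp::VERSION           # => "0.14.4"
puts LLaMACpp::LLAMA_CPP_VERSION # => "b2608"
```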
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_M: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -108,6 +112,7 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
+ def n_layer: () -> Integer
  def rope_freq_scale_train: () -> Float
  def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +249,7 @@ module LLaMACpp
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+ def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
  end

  class ContextParams
data/vendor/tmp/llama.cpp/Makefile CHANGED
@@ -1,15 +1,16 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+ tests/test-json-schema-to-grammar

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -391,14 +392,20 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
+ # LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+ endif
+
+ ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
@@ -453,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  else
  MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
  endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
- #ifdef LLAMA_CUDA_CUBLAS
- # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
- #endif # LLAMA_CUDA_CUBLAS
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ define NVCC_COMPILE
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  else
+ define NVCC_COMPILE
  $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  endif # JETSON_EOL_MODULE_DETECT
- endif # LLAMA_CUBLAS
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ $(NVCC_COMPILE)
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(NVCC_COMPILE)
+
+ endif # LLAMA_CUDA

  ifdef LLAMA_CLBLAST

@@ -511,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
  endif # LLAMA_VULKAN

  ifdef LLAMA_HIPBLAS
-
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
  GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -523,7 +540,7 @@ ifdef LLAMA_HIPBLAS
  LLAMA_CUDA_DMMV_X ?= 32
  LLAMA_CUDA_MMV_Y ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
  ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
  endif # LLAMA_HIP_UMA
@@ -536,9 +553,18 @@ endif # LLAMA_HIP_UMA
  ifdef LLAMA_CUDA_FORCE_DMMV
  HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  OBJS += ggml-cuda.o
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
  endif # LLAMA_HIPBLAS

  ifdef LLAMA_METAL
@@ -555,7 +581,7 @@ endif
  endif # LLAMA_METAL

  ifdef LLAMA_METAL
- ggml-metal.o: ggml-metal.m ggml-metal.h
+ ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
  $(CC) $(CFLAGS) -c $< -o $@

  ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -591,12 +617,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

  # identify CUDA host compiler
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
  CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
  endif

+ ifdef LLAMA_CURL
+ override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+ override LDFLAGS := $(LDFLAGS) -lcurl
+ endif
+
  #
  # Print build information
  #
@@ -611,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
  $(info I LDFLAGS: $(LDFLAGS))
  $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
  CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -621,8 +652,15 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
- endif # LLAMA_CUBLAS
+ endif # LLAMA_CUDA
+ $(info )
+
+ ifdef LLAMA_CUBLAS
+ $(info !!!!)
+ $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+ $(info !!!!)
  $(info )
+ endif

  #
  # Build library
@@ -643,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  unicode.o: unicode.cpp unicode.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+ unicode-data.o: unicode-data.cpp unicode-data.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -663,9 +704,15 @@ console.o: common/console.cpp common/console.h
  grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

+ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  train.o: common/train.cpp common/train.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

+ ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -677,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf ggml-cuda/*.o

  #
  # Examples
@@ -745,7 +793,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -753,6 +801,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

  passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  ifeq ($(UNAME_S),Darwin)
  swift: examples/batched.swift
  (cd examples/batched.swift; make build)
@@ -861,6 +927,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c CHANGED
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

- if (ggml_is_view(node)) {
+ // TODO: better way to add external dependencies
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+ // itself is never used and should not be considered a dependency
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
  struct ggml_tensor * view_src = node->view_src;
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
  }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

  ggml_gallocr_hash_get(galloc, src)->n_children += 1;

- // allocate explicit inputs and leafs
- if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ // allocate explicit inputs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
  ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
  }
  }
@@ -701,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ if (leaf->view_src || leaf->data) {
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+ galloc->leaf_allocs[i].leaf.size_max = 0;
+ } else {
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
  }

  // reallocate buffers if needed
data/vendor/tmp/llama.cpp/ggml-backend-impl.h CHANGED
@@ -103,6 +103,11 @@ extern "C" {
  // check if the backend supports an operation
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+ // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+ // these should be expensive operations with large batch sizes that may benefit from running on this backend
+ // even if the weight has to be copied from the CPU temporarily
+ bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
  // (optional) event synchronization
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
  void (*GGML_CALL event_free) (ggml_backend_event_t event);