llama_cpp 0.14.2 → 0.14.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: d720138462a39f3fd9853befa19e55543a794eb4d1c379f7d9db516a4569db68
- data.tar.gz: 9779852b62cf57ab208275b746ca2492921cf1f92d4ebf6be26a668f1a7bbb66
+ metadata.gz: 92ebd411f54255b05074ef79ed3e220c9ff4332164cfc831122d766226322515
+ data.tar.gz: 571f0ec65776945d40a54e31bba26cc4194b19965cc4841ce40b9ad3d94605df
  SHA512:
- metadata.gz: 7fa80468abc917099b58009a7a821c704989b8086026e92f8e71a1310ea7ec0449276aeb4653bdb4ddf499183c785b0513ab75b1fa6a94659fe15be2cf05190c
- data.tar.gz: '091155784ead62d3ef4ec68f3b4c9f6b1fc97ef87db45327266712912501746b08df983e0d0b81b518d229d4d31f1a0d77ad36f2d7156c26141b8116049d3206'
+ metadata.gz: 34ca9567b7eb96add562e977f22f8b2be087c026c85bf92cd5e31f9a96ea5f02a841bdf05f745c4079740a4bb01476fb9bab313317d66dbf8870fa829a269c86
+ data.tar.gz: 64b19ef010bb52800cd3710c1ec70bcb7b747e53b6ea7d8f13d84d336b1ee67868153f5bbdfee0b4131dfddaf1c656c49bd774084ee2d14f191d22d215a47737
data/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
+ ## [[0.14.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.3...v0.14.4)] - 2024-04-06
+
+ - Bump llama.cpp from b2496 to b2573.
+ - Add file type constants.
+ - Bump llama.cpp from b2573 to b2608.
+
+ Implementation of llama_split_path, llama_split_prefix binding, llama_grammar_accept, and decode_utf8 has been skipped.
+
+ ## [[0.14.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.2...v0.14.3)] - 2024-03-23
+
+ - Bump llama.cpp from b2435 to b2496.
+ - Add `n_layer` method to `Model`.
+ - Add `apply_control_vector` method to `Context`.
+
  ## [[0.14.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.1...v0.14.2)] - 2024-03-16

  - Fix to use metal embed library on macOS.
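Usage sketch (not part of the diff): the 0.14.3/0.14.4 additions below are exercised against a loaded model and context. The constructor keywords follow the pattern from the gem's README, and the model path is a placeholder.

    require 'llama_cpp'

    model_params = LLaMACpp::ModelParams.new
    model = LLaMACpp::Model.new(model_path: '/path/to/model.gguf', params: model_params)
    context_params = LLaMACpp::ContextParams.new
    context = LLaMACpp::Context.new(model: model, params: context_params)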
@@ -1466,6 +1466,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "n_vocab", RUBY_METHOD_FUNC(_llama_model_get_model_n_vocab), 0);
  rb_define_method(rb_cLLaMAModel, "n_ctx_train", RUBY_METHOD_FUNC(_llama_model_get_model_n_ctx_train), 0);
  rb_define_method(rb_cLLaMAModel, "n_embd", RUBY_METHOD_FUNC(_llama_model_get_model_n_embd), 0);
+ rb_define_method(rb_cLLaMAModel, "n_layer", RUBY_METHOD_FUNC(_llama_model_get_model_n_layer), 0);
  rb_define_method(rb_cLLaMAModel, "rope_freq_scale_train", RUBY_METHOD_FUNC(_llama_model_rope_freq_scale_train), 0);
  rb_define_method(rb_cLLaMAModel, "token_to_piece", RUBY_METHOD_FUNC(_llama_model_token_to_piece), 1);
  rb_define_method(rb_cLLaMAModel, "tokenize", RUBY_METHOD_FUNC(_llama_model_tokenize), -1);
@@ -1613,6 +1614,11 @@ private:
  return INT2NUM(llama_n_embd(ptr->model));
  }

+ static VALUE _llama_model_get_model_n_layer(VALUE self) {
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return INT2NUM(llama_n_layer(ptr->model));
+ }
+
  static VALUE _llama_model_rope_freq_scale_train(VALUE self) {
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return DBL2NUM(llama_rope_freq_scale_train(ptr->model));
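The binding above wraps llama_n_layer, so the layer count becomes readable from Ruby next to the existing metadata accessors. A small sketch, continuing the setup shown earlier:

    puts model.n_vocab      # vocabulary size
    puts model.n_ctx_train  # context length used during training
    puts model.n_embd       # embedding width
    puts model.n_layer      # number of transformer layers (new in 0.14.3)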
@@ -2083,6 +2089,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_token", RUBY_METHOD_FUNC(_llama_context_sample_token), 1);
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
+ rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
  }

  private:
@@ -3153,6 +3160,59 @@ private:

  return Qnil;
  }
+
+ static VALUE _llama_context_apply_control_vector(int argc, VALUE* argv, VALUE self) {
+ VALUE kw_args = Qnil;
+ ID kw_table[4] = { rb_intern("data"), rb_intern("n_embd"), rb_intern("il_start"), rb_intern("il_end") };
+ VALUE kw_values[4] = { Qundef, Qundef, Qundef, Qundef };
+ rb_scan_args(argc, argv, ":", &kw_args);
+ rb_get_kwargs(kw_args, kw_table, 4, 0, kw_values);
+
+ if (!RB_TYPE_P(kw_values[0], T_ARRAY) && !NIL_P(kw_values[0])) {
+ rb_raise(rb_eArgError, "data must be an Array or nil");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[1])) {
+ rb_raise(rb_eArgError, "n_embd must be an Integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[2])) {
+ rb_raise(rb_eArgError, "il_start must be an Integer");
+ return Qnil;
+ }
+ if (!RB_INTEGER_TYPE_P(kw_values[3])) {
+ rb_raise(rb_eArgError, "il_end must be an Integer");
+ return Qnil;
+ }
+
+ LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+ if (ctx_ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+
+ std::vector<float> data(RARRAY_LEN(kw_values[0]));
+ for (size_t i = 0; i < data.size(); i++) {
+ data[i] = NUM2DBL(rb_ary_entry(kw_values[0], i));
+ }
+ const int32_t n_embd = NUM2INT(kw_values[1]);
+ const int32_t il_start = NUM2INT(kw_values[2]);
+ const int32_t il_end = NUM2INT(kw_values[3]);
+
+ int32_t err = 0;
+ if (NIL_P(kw_values[0])) {
+ err = llama_control_vector_apply(ctx_ptr->ctx, NULL, 0, n_embd, il_start, il_end);
+ } else {
+ err = llama_control_vector_apply(ctx_ptr->ctx, data.data(), data.size(), n_embd, il_start, il_end);
+ }
+
+ if (err) {
+ rb_raise(rb_eRuntimeError, "Failed to apply control vector");
+ return Qnil;
+ }
+
+ return Qnil;
+ }
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
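The new Context#apply_control_vector forwards its keyword arguments to llama_control_vector_apply, with data passed as a flat Array of Floats covering the selected layer range. A hedged sketch, continuing the earlier setup; the all-zero values and the layer range are placeholders, and the expected length (n_embd entries per layer) is an assumption based on the llama.cpp API rather than something this diff states:

    n_embd = model.n_embd
    data = Array.new(n_embd * model.n_layer, 0.0) # placeholder control-vector values
    context.apply_control_vector(data: data, n_embd: n_embd, il_start: 1, il_end: model.n_layer)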
@@ -3311,6 +3371,10 @@ extern "C" void Init_llama_cpp(void) {
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_S));
  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_NL", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_NL));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_S", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_S));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_M));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ4_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ4_XS));
+ rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ1_M", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ1_M));

  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
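The four file-type constants registered above are plain Integers on the LLaMACpp module, mirroring the corresponding llama.cpp enum entries. A minimal check once the extension is loaded, using only names defined in this diff:

    %w[LLAMA_FTYPE_MOSTLY_IQ3_S LLAMA_FTYPE_MOSTLY_IQ3_M
       LLAMA_FTYPE_MOSTLY_IQ4_XS LLAMA_FTYPE_MOSTLY_IQ1_M].each do |name|
      puts "#{name} = #{LLaMACpp.const_get(name)}"
    end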
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.2'
+ VERSION = '0.14.4'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2435'
+ LLAMA_CPP_VERSION = 'b2608'
  end
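A quick post-upgrade check that the installed gem and the bundled llama.cpp revision match this release:

    require 'llama_cpp'

    puts LLaMACpp::VERSION            # => "0.14.4"
    puts LLaMACpp::LLAMA_CPP_VERSION  # => "b2608"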
data/sig/llama_cpp.rbs CHANGED
@@ -32,6 +32,10 @@ module LLaMACpp
  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
  LLAMA_FTYPE_MOSTLY_IQ1_S: Integer
  LLAMA_FTYPE_MOSTLY_IQ4_NL: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_S: Integer
+ LLAMA_FTYPE_MOSTLY_IQ3_M: Integer
+ LLAMA_FTYPE_MOSTLY_IQ4_XS: Integer
+ LLAMA_FTYPE_MOSTLY_IQ1_M: Integer

  LLAMA_KV_OVERRIDE_TYPE_INT: Integer
  LLAMA_KV_OVERRIDE_TYPE_FLOAT: Integer
@@ -108,6 +112,7 @@ module LLaMACpp
  def n_vocab: () -> Integer
  def n_ctx_train: () -> Integer
  def n_embd: () -> Integer
+ def n_layer: () -> Integer
  def rope_freq_scale_train: () -> Float
  def token_to_piece: (Integer) -> String
  def tokenize: (text: String, ?n_max_tokens: Integer, ?add_bos: bool, ?special: bool) -> Array[Integer]
@@ -244,6 +249,7 @@ module LLaMACpp
  def sample_token: (::LLaMACpp::TokenDataArray) -> Integer
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
+ def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
  end

  class ContextParams
@@ -1,15 +1,16 @@
  # Define the default target now so that it is always the first target
  BUILD_TARGETS = \
  main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
- speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
+ simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

  # Binaries only useful for tests
  TEST_TARGETS = \
  tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
  tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
  tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
- tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
+ tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
+ tests/test-json-schema-to-grammar

  # Code coverage output files
  COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -391,14 +392,20 @@ ifdef LLAMA_BLIS
  endif # LLAMA_BLIS

  ifdef LLAMA_CUBLAS
+ # LLAMA_CUBLAS is deprecated and will be removed in the future
+ LLAMA_CUDA := 1
+ endif
+
+ ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
  MK_NVCCFLAGS += -use_fast_math
  ifdef LLAMA_FATAL_WARNINGS
  MK_NVCCFLAGS += -Werror all-warnings
@@ -453,19 +460,30 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
  else
  MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
  endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
- #ifdef LLAMA_CUDA_CUBLAS
- # MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
- #endif # LLAMA_CUDA_CUBLAS
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  ifdef LLAMA_CUDA_CCBIN
  MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
  endif
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
+
  ifdef JETSON_EOL_MODULE_DETECT
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ define NVCC_COMPILE
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  else
+ define NVCC_COMPILE
  $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ endef # NVCC_COMPILE
  endif # JETSON_EOL_MODULE_DETECT
- endif # LLAMA_CUBLAS
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ $(NVCC_COMPILE)
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(NVCC_COMPILE)
+
+ endif # LLAMA_CUDA

  ifdef LLAMA_CLBLAST

@@ -511,7 +529,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
  endif # LLAMA_VULKAN

  ifdef LLAMA_HIPBLAS
-
  ifeq ($(wildcard /opt/rocm),)
  ROCM_PATH ?= /usr
  GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
@@ -523,7 +540,7 @@ ifdef LLAMA_HIPBLAS
  LLAMA_CUDA_DMMV_X ?= 32
  LLAMA_CUDA_MMV_Y ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
  ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
  endif # LLAMA_HIP_UMA
@@ -536,9 +553,18 @@ endif # LLAMA_HIP_UMA
  ifdef LLAMA_CUDA_FORCE_DMMV
  HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
  endif # LLAMA_CUDA_FORCE_DMMV
+ ifdef LLAMA_CUDA_NO_PEER_COPY
+ HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+ endif # LLAMA_CUDA_NO_PEER_COPY
  OBJS += ggml-cuda.o
- ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
+ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
+ $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
  $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
  endif # LLAMA_HIPBLAS

  ifdef LLAMA_METAL
@@ -555,7 +581,7 @@ endif
  endif # LLAMA_METAL

  ifdef LLAMA_METAL
- ggml-metal.o: ggml-metal.m ggml-metal.h
+ ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
  $(CC) $(CFLAGS) -c $< -o $@

  ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -591,12 +617,17 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
  override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

  # identify CUDA host compiler
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
  include scripts/get-flags.mk
  CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
  endif

+ ifdef LLAMA_CURL
+ override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+ override LDFLAGS := $(LDFLAGS) -lcurl
+ endif
+
  #
  # Print build information
  #
@@ -611,7 +642,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
  $(info I LDFLAGS: $(LDFLAGS))
  $(info I CC: $(shell $(CC) --version | head -n 1))
  $(info I CXX: $(shell $(CXX) --version | head -n 1))
- ifdef LLAMA_CUBLAS
+ ifdef LLAMA_CUDA
  $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
  CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
  ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -621,8 +652,15 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
  endif # CUDA_POWER_ARCH
  endif # CUDA_DOCKER_ARCH
  endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
- endif # LLAMA_CUBLAS
+ endif # LLAMA_CUDA
+ $(info )
+
+ ifdef LLAMA_CUBLAS
+ $(info !!!!)
+ $(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+ $(info !!!!)
  $(info )
+ endif

  #
  # Build library
@@ -643,7 +681,10 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
  unicode.o: unicode.cpp unicode.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
+ unicode-data.o: unicode-data.cpp unicode-data.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o

  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -663,9 +704,15 @@ console.o: common/console.cpp common/console.h
  grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

+ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-to-grammar.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  train.o: common/train.cpp common/train.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

+ ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
  libllama.so: llama.o ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -677,7 +724,8 @@ lib: llama.o ggml.o $(OBJS)
  ar rcs libllama.a $^

  clean:
- rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+ rm -vrf ggml-cuda/*.o

  #
  # Examples
@@ -745,7 +793,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -753,6 +801,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -790,6 +842,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -802,14 +858,24 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS)
+ $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS)

  passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  ifeq ($(UNAME_S),Darwin)
  swift: examples/batched.swift
  (cd examples/batched.swift; make build)
@@ -861,6 +927,10 @@ tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp json-schema-to-grammar.o ggml.o llama.o grammar-parser.o $(OBJS)
+ $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
+ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
  tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
  for (int i = 0; i < graph->n_nodes; i++) {
  struct ggml_tensor * node = graph->nodes[i];

- if (ggml_is_view(node)) {
+ // TODO: better way to add external dependencies
+ // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+ // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+ // itself is never used and should not be considered a dependency
+ if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
  struct ggml_tensor * view_src = node->view_src;
  ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
  }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

  ggml_gallocr_hash_get(galloc, src)->n_children += 1;

- // allocate explicit inputs and leafs
- if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ // allocate explicit inputs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT) {
  ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
  }
  }
@@ -701,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  struct ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
  galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ if (leaf->view_src || leaf->data) {
+ galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+ galloc->leaf_allocs[i].leaf.size_max = 0;
+ } else {
+ galloc->leaf_allocs[i].leaf.offset = hn->offset;
+ galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
  }

  // reallocate buffers if needed
@@ -103,6 +103,11 @@ extern "C" {
  // check if the backend supports an operation
  bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+ // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+ // these should be expensive operations with large batch sizes that may benefit from running on this backend
+ // even if the weight has to be copied from the CPU temporarily
+ bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
  // (optional) event synchronization
  ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
  void (*GGML_CALL event_free) (ggml_backend_event_t event);