llama_cpp 0.14.0 → 0.14.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +71 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +9 -0
- data/vendor/tmp/llama.cpp/Makefile +28 -12
- data/vendor/tmp/llama.cpp/ggml-alloc.c +45 -64
- data/vendor/tmp/llama.cpp/ggml-alloc.h +13 -5
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.c +358 -135
- data/vendor/tmp/llama.cpp/ggml-backend.h +41 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +1830 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +187 -1033
- data/vendor/tmp/llama.cpp/ggml-impl.h +6 -2
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +5 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +42 -20
- data/vendor/tmp/llama.cpp/ggml-metal.metal +44 -910
- data/vendor/tmp/llama.cpp/ggml-quants.c +457 -1074
- data/vendor/tmp/llama.cpp/ggml-quants.h +27 -259
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +388 -565
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +6 -39
- data/vendor/tmp/llama.cpp/ggml.c +509 -343
- data/vendor/tmp/llama.cpp/ggml.h +61 -47
- data/vendor/tmp/llama.cpp/llama.cpp +1446 -687
- data/vendor/tmp/llama.cpp/llama.h +25 -11
- data/vendor/tmp/llama.cpp/unicode.cpp +1672 -0
- data/vendor/tmp/llama.cpp/unicode.h +16 -774
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2a192fa17c1d313a93306e415ec27dfb8fb6ce993b9fc78797ed6e1d38ca63f
|
4
|
+
data.tar.gz: f800e54961a8bea5de95373d15f0cda30f7e95edd655cc0504247dfefcff473a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48cefba1491319f82d52a46e8be34b5f0115dbe80bd6a9fdbf4fe0e190581a6b1ff8c3e2b2dfdaefeaa0b7cb11c8b9f5a84bcb60354f64248abbee3d488378ee
|
7
|
+
data.tar.gz: 9c6d75d3818b61192bd5c93a8b091003e2342f28102de1fbc9a1a02955a7c89e2a144b82bbe83e805b3f741261e967469c3ad2f6d347b1b870fb51880b850d89
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## [[0.14.1](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.0...v0.14.1)] - 2024-03-16
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2361 to b2435.
|
4
|
+
- Add constants for vocabulary type: `LLAMA_VOCAB_TYPE_NONE`.
|
5
|
+
- Add `n_ubatch` and `n_seq_max` accessors to `ContextParams`.
|
6
|
+
- Add `n_ubatch`, `n_seq_max`, `set_causal_attn`, and `synchronize` methods to `Context`.
|
7
|
+
|
1
8
|
## [[0.14.0](https://github.com/yoshoku/llama_cpp.rb/compare/v0.13.0...v0.14.0)] - 2024-03-09
|
2
9
|
|
3
10
|
**Breaking Changes**
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -946,6 +946,10 @@ public:
|
|
946
946
|
rb_define_method(rb_cLLaMAContextParams, "n_ctx", RUBY_METHOD_FUNC(_llama_context_params_get_n_ctx), 0);
|
947
947
|
rb_define_method(rb_cLLaMAContextParams, "n_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_batch), 1);
|
948
948
|
rb_define_method(rb_cLLaMAContextParams, "n_batch", RUBY_METHOD_FUNC(_llama_context_params_get_n_batch), 0);
|
949
|
+
rb_define_method(rb_cLLaMAContextParams, "n_ubatch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_ubatch), 1);
|
950
|
+
rb_define_method(rb_cLLaMAContextParams, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_params_get_n_ubatch), 0);
|
951
|
+
rb_define_method(rb_cLLaMAContextParams, "n_seq_max=", RUBY_METHOD_FUNC(_llama_context_params_set_n_seq_max), 1);
|
952
|
+
rb_define_method(rb_cLLaMAContextParams, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_params_get_n_seq_max), 0);
|
949
953
|
rb_define_method(rb_cLLaMAContextParams, "n_threads=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads), 1);
|
950
954
|
rb_define_method(rb_cLLaMAContextParams, "n_threads", RUBY_METHOD_FUNC(_llama_context_params_get_n_threads), 0);
|
951
955
|
rb_define_method(rb_cLLaMAContextParams, "n_threads_batch=", RUBY_METHOD_FUNC(_llama_context_params_set_n_threads_batch), 1);
|
@@ -1031,6 +1035,30 @@ private:
|
|
1031
1035
|
return INT2NUM(ptr->params.n_batch);
|
1032
1036
|
}
|
1033
1037
|
|
1038
|
+
// n_ubatch
|
1039
|
+
static VALUE _llama_context_params_set_n_ubatch(VALUE self, VALUE n_ubatch) {
|
1040
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1041
|
+
ptr->params.n_ubatch = NUM2INT(n_ubatch);
|
1042
|
+
return INT2NUM(ptr->params.n_ubatch);
|
1043
|
+
}
|
1044
|
+
|
1045
|
+
static VALUE _llama_context_params_get_n_ubatch(VALUE self) {
|
1046
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1047
|
+
return INT2NUM(ptr->params.n_ubatch);
|
1048
|
+
}
|
1049
|
+
|
1050
|
+
// n_seq_max
|
1051
|
+
static VALUE _llama_context_params_set_n_seq_max(VALUE self, VALUE n_seq_max) {
|
1052
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1053
|
+
ptr->params.n_seq_max = NUM2INT(n_seq_max);
|
1054
|
+
return INT2NUM(ptr->params.n_seq_max);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
static VALUE _llama_context_params_get_n_seq_max(VALUE self) {
|
1058
|
+
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
1059
|
+
return INT2NUM(ptr->params.n_seq_max);
|
1060
|
+
}
|
1061
|
+
|
1034
1062
|
// n_threads
|
1035
1063
|
static VALUE _llama_context_params_set_n_threads(VALUE self, VALUE n_threads) {
|
1036
1064
|
LLaMAContextParamsWrapper* ptr = get_llama_context_params(self);
|
@@ -2019,6 +2047,8 @@ public:
|
|
2019
2047
|
rb_define_method(rb_cLLaMAContext, "embeddings_seq", RUBY_METHOD_FUNC(_llama_context_embeddings_seq), 1);
|
2020
2048
|
rb_define_method(rb_cLLaMAContext, "n_ctx", RUBY_METHOD_FUNC(_llama_context_n_ctx), 0);
|
2021
2049
|
rb_define_method(rb_cLLaMAContext, "n_batch", RUBY_METHOD_FUNC(_llama_context_n_batch), 0);
|
2050
|
+
rb_define_method(rb_cLLaMAContext, "n_ubatch", RUBY_METHOD_FUNC(_llama_context_n_ubatch), 0);
|
2051
|
+
rb_define_method(rb_cLLaMAContext, "n_seq_max", RUBY_METHOD_FUNC(_llama_context_n_seq_max), 0);
|
2022
2052
|
rb_define_method(rb_cLLaMAContext, "timings", RUBY_METHOD_FUNC(_llama_context_get_timings), 0);
|
2023
2053
|
rb_define_method(rb_cLLaMAContext, "print_timings", RUBY_METHOD_FUNC(_llama_context_print_timings), 0);
|
2024
2054
|
rb_define_method(rb_cLLaMAContext, "reset_timings", RUBY_METHOD_FUNC(_llama_context_reset_timings), 0);
|
@@ -2033,6 +2063,8 @@ public:
|
|
2033
2063
|
rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_defrag", RUBY_METHOD_FUNC(_llama_context_kv_cache_defrag), 0);
|
2034
2064
|
rb_define_method(rb_cLLaMAContext, "kv_cache_kv_cache_update", RUBY_METHOD_FUNC(_llama_context_kv_cache_update), 0);
|
2035
2065
|
rb_define_method(rb_cLLaMAContext, "set_rng_seed", RUBY_METHOD_FUNC(_llama_context_set_rng_seed), 1);
|
2066
|
+
rb_define_method(rb_cLLaMAContext, "set_causal_attn", RUBY_METHOD_FUNC(_llama_context_set_causal_attn), 1);
|
2067
|
+
rb_define_method(rb_cLLaMAContext, "synchronize", RUBY_METHOD_FUNC(_llama_context_synchronize), 0);
|
2036
2068
|
rb_define_method(rb_cLLaMAContext, "load_session_file", RUBY_METHOD_FUNC(_llama_context_load_session_file), -1);
|
2037
2069
|
rb_define_method(rb_cLLaMAContext, "save_session_file", RUBY_METHOD_FUNC(_llama_context_save_session_file), -1);
|
2038
2070
|
rb_define_method(rb_cLLaMAContext, "sample_repetition_penalties", RUBY_METHOD_FUNC(_llama_context_sample_repetition_penalties), -1);
|
@@ -2250,6 +2282,24 @@ private:
|
|
2250
2282
|
return UINT2NUM(llama_n_batch(ptr->ctx));
|
2251
2283
|
}
|
2252
2284
|
|
2285
|
+
static VALUE _llama_context_n_ubatch(VALUE self) {
|
2286
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2287
|
+
if (ptr->ctx == NULL) {
|
2288
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2289
|
+
return Qnil;
|
2290
|
+
}
|
2291
|
+
return UINT2NUM(llama_n_ubatch(ptr->ctx));
|
2292
|
+
}
|
2293
|
+
|
2294
|
+
static VALUE _llama_context_n_seq_max(VALUE self) {
|
2295
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2296
|
+
if (ptr->ctx == NULL) {
|
2297
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2298
|
+
return Qnil;
|
2299
|
+
}
|
2300
|
+
return UINT2NUM(llama_n_seq_max(ptr->ctx));
|
2301
|
+
}
|
2302
|
+
|
2253
2303
|
static VALUE _llama_context_get_timings(VALUE self) {
|
2254
2304
|
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2255
2305
|
if (ptr->ctx == NULL) {
|
@@ -2395,6 +2445,26 @@ private:
|
|
2395
2445
|
return Qnil;
|
2396
2446
|
}
|
2397
2447
|
|
2448
|
+
static VALUE _llama_context_set_causal_attn(VALUE self, VALUE causal_attn) {
|
2449
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2450
|
+
if (ptr->ctx == NULL) {
|
2451
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2452
|
+
return Qnil;
|
2453
|
+
}
|
2454
|
+
llama_set_causal_attn(ptr->ctx, RTEST(causal_attn) ? true : false);
|
2455
|
+
return Qnil;
|
2456
|
+
}
|
2457
|
+
|
2458
|
+
static VALUE _llama_context_synchronize(VALUE self) {
|
2459
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
2460
|
+
if (ptr->ctx == NULL) {
|
2461
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
2462
|
+
return Qnil;
|
2463
|
+
}
|
2464
|
+
llama_synchronize(ptr->ctx);
|
2465
|
+
return Qnil;
|
2466
|
+
}
|
2467
|
+
|
2398
2468
|
static VALUE _llama_context_load_session_file(int argc, VALUE* argv, VALUE self) {
|
2399
2469
|
VALUE kw_args = Qnil;
|
2400
2470
|
ID kw_table[1] = { rb_intern("session_path") };
|
@@ -3204,6 +3274,7 @@ extern "C" void Init_llama_cpp(void) {
|
|
3204
3274
|
rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
|
3205
3275
|
rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
|
3206
3276
|
|
3277
|
+
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_NONE", INT2NUM(LLAMA_VOCAB_TYPE_NONE));
|
3207
3278
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
|
3208
3279
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
|
3209
3280
|
rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_WPM", INT2NUM(LLAMA_VOCAB_TYPE_WPM));
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.0'
|
6
|
+
VERSION = '0.14.1'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2361'
|
9
|
+
LLAMA_CPP_VERSION = 'b2435'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -3,6 +3,7 @@ module LLaMACpp
|
|
3
3
|
LLAMA_CPP_VERSION: String
|
4
4
|
LLAMA_DEFALUT_SEED: String
|
5
5
|
|
6
|
+
LLAMA_VOCAB_TYPE_NONE: Integer
|
6
7
|
LLAMA_VOCAB_TYPE_SPM: Integer
|
7
8
|
LLAMA_VOCAB_TYPE_BPE: Integer
|
8
9
|
LLAMA_VOCAB_TYPE_WPM: Integer
|
@@ -207,6 +208,8 @@ module LLaMACpp
|
|
207
208
|
def logits: () -> Array[Float]
|
208
209
|
def n_ctx: () -> Integer
|
209
210
|
def n_batch: () -> Integer
|
211
|
+
def n_ubatch: () -> Integer
|
212
|
+
def n_seq_max: () -> Integer
|
210
213
|
def timings: () -> ::LLaMACpp::Timings
|
211
214
|
def print_timings: () -> void
|
212
215
|
def reset_timings: () -> void
|
@@ -221,6 +224,8 @@ module LLaMACpp
|
|
221
224
|
def kv_cache_defrag: () -> void
|
222
225
|
def kv_cache_update: () -> void
|
223
226
|
def set_rng_seed: (Integer) -> void
|
227
|
+
def set_causal_attn: (bool) -> void
|
228
|
+
def synchronize: () -> void
|
224
229
|
def load_session_file: (session_path: String) -> void
|
225
230
|
def save_session_file: (session_path: String, session_tokens: Array[Integer]) -> void
|
226
231
|
def sample_repetition_penalties: (::LLaMACpp::TokenDataArray, Array[Integer], penalty_repeat: Float, penalty_freq: Float, penalty_present: Float) -> void
|
@@ -250,6 +255,10 @@ module LLaMACpp
|
|
250
255
|
def n_ctx=: (Integer) -> Integer
|
251
256
|
def n_batch: () -> Integer
|
252
257
|
def n_batch=: (Integer) -> Integer
|
258
|
+
def n_ubatch: () -> Integer
|
259
|
+
def n_ubatch=: (Integer) -> Integer
|
260
|
+
def n_seq_max: () -> Integer
|
261
|
+
def n_seq_max=: (Integer) -> Integer
|
253
262
|
def n_threads: () -> Integer
|
254
263
|
def n_threads=: (Integer) -> Integer
|
255
264
|
def n_threads_batch: () -> Integer
|
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
BUILD_TARGETS = \
|
3
3
|
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
|
4
4
|
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
|
5
|
-
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
|
5
|
+
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
|
6
6
|
|
7
7
|
# Binaries only useful for tests
|
8
8
|
TEST_TARGETS = \
|
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
|
|
167
167
|
MK_CPPFLAGS += -D_BSD_SOURCE
|
168
168
|
endif
|
169
169
|
|
170
|
+
ifdef LLAMA_SCHED_MAX_COPIES
|
171
|
+
MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
|
172
|
+
endif
|
173
|
+
|
170
174
|
ifdef LLAMA_DEBUG
|
171
175
|
MK_CFLAGS += -O0 -g
|
172
176
|
MK_CXXFLAGS += -O0 -g
|
@@ -201,6 +205,10 @@ ifdef LLAMA_SERVER_VERBOSE
|
|
201
205
|
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
202
206
|
endif
|
203
207
|
|
208
|
+
ifdef LLAMA_SERVER_SSL
|
209
|
+
MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
|
210
|
+
MK_LDFLAGS += -lssl -lcrypto
|
211
|
+
endif
|
204
212
|
|
205
213
|
ifdef LLAMA_CODE_COVERAGE
|
206
214
|
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
|
@@ -451,7 +459,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
|
451
459
|
ifdef LLAMA_CUDA_CCBIN
|
452
460
|
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
453
461
|
endif
|
454
|
-
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
462
|
+
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
|
455
463
|
ifdef JETSON_EOL_MODULE_DETECT
|
456
464
|
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
457
465
|
else
|
@@ -551,15 +559,16 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
|
|
551
559
|
$(CC) $(CFLAGS) -c $< -o $@
|
552
560
|
|
553
561
|
ifdef LLAMA_METAL_EMBED_LIBRARY
|
554
|
-
ggml-metal-embed.o: ggml-metal.metal
|
562
|
+
ggml-metal-embed.o: ggml-metal.metal ggml-common.h
|
555
563
|
@echo "Embedding Metal library"
|
564
|
+
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
|
556
565
|
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
557
|
-
@echo ".section __DATA, __ggml_metallib"
|
558
|
-
@echo ".globl _ggml_metallib_start"
|
559
|
-
@echo "_ggml_metallib_start:"
|
560
|
-
@echo ".incbin \"
|
561
|
-
@echo ".globl _ggml_metallib_end"
|
562
|
-
@echo "_ggml_metallib_end:"
|
566
|
+
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
567
|
+
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
568
|
+
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
569
|
+
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
|
570
|
+
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
571
|
+
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
563
572
|
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
564
573
|
@rm -f ${TEMP_ASSEMBLY}
|
565
574
|
endif
|
@@ -628,12 +637,15 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
|
628
637
|
ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
|
629
638
|
$(CC) $(CFLAGS) -c $< -o $@
|
630
639
|
|
631
|
-
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
|
640
|
+
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
|
632
641
|
$(CC) $(CFLAGS) -c $< -o $@
|
633
642
|
|
634
|
-
|
643
|
+
unicode.o: unicode.cpp unicode.h
|
644
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
635
645
|
|
636
|
-
|
646
|
+
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
|
647
|
+
|
648
|
+
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
637
649
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
638
650
|
|
639
651
|
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
@@ -725,6 +737,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
|
|
725
737
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
726
738
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
727
739
|
|
740
|
+
gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
741
|
+
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
742
|
+
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
743
|
+
|
728
744
|
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
729
745
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
730
746
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -61,7 +61,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
|
61
61
|
}
|
62
62
|
}
|
63
63
|
|
64
|
-
// TODO: GGML_PAD ?
|
65
64
|
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
|
66
65
|
assert(alignment && !(alignment & (alignment - 1))); // power of 2
|
67
66
|
size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
|
@@ -69,25 +68,14 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen
|
|
69
68
|
}
|
70
69
|
|
71
70
|
// tallocr
|
72
|
-
struct ggml_tallocr {
|
73
|
-
ggml_backend_buffer_t buffer;
|
74
|
-
void * base;
|
75
|
-
size_t alignment;
|
76
|
-
size_t offset;
|
77
|
-
};
|
78
|
-
|
79
|
-
ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
|
80
|
-
ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
|
81
|
-
if (talloc == NULL) {
|
82
|
-
return NULL;
|
83
|
-
}
|
84
71
|
|
72
|
+
struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
|
85
73
|
void * base = ggml_backend_buffer_get_base(buffer);
|
86
74
|
size_t align = ggml_backend_buffer_get_alignment(buffer);
|
87
75
|
|
88
76
|
assert(align && !(align & (align - 1))); // power of 2
|
89
77
|
|
90
|
-
|
78
|
+
struct ggml_tallocr talloc = (struct ggml_tallocr) {
|
91
79
|
/*.buffer = */ buffer,
|
92
80
|
/*.base = */ base,
|
93
81
|
/*.alignment = */ align,
|
@@ -96,11 +84,7 @@ ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
|
|
96
84
|
return talloc;
|
97
85
|
}
|
98
86
|
|
99
|
-
void ggml_tallocr_free(ggml_tallocr_t talloc) {
|
100
|
-
free(talloc);
|
101
|
-
}
|
102
|
-
|
103
|
-
void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
|
87
|
+
void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
|
104
88
|
size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
|
105
89
|
size = GGML_PAD(size, talloc->alignment);
|
106
90
|
|
@@ -354,12 +338,16 @@ struct hash_node {
|
|
354
338
|
bool allocated;
|
355
339
|
};
|
356
340
|
|
357
|
-
//
|
358
341
|
struct tensor_alloc {
|
359
342
|
size_t offset;
|
360
343
|
size_t size_max; // 0 = pre-allocated, unused, or view
|
361
344
|
};
|
362
345
|
|
346
|
+
struct leaf_alloc {
|
347
|
+
int buffer_id;
|
348
|
+
struct tensor_alloc leaf;
|
349
|
+
};
|
350
|
+
|
363
351
|
struct node_alloc {
|
364
352
|
int buffer_id;
|
365
353
|
struct tensor_alloc dst;
|
@@ -378,7 +366,7 @@ struct ggml_gallocr {
|
|
378
366
|
struct node_alloc * node_allocs; // [n_nodes]
|
379
367
|
int n_nodes;
|
380
368
|
|
381
|
-
struct tensor_alloc * leaf_allocs; // [n_leafs]
|
369
|
+
struct leaf_alloc * leaf_allocs; // [n_leafs]
|
382
370
|
int n_leafs;
|
383
371
|
};
|
384
372
|
|
@@ -543,13 +531,20 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
|
|
543
531
|
return node_buffer_ids ? node_buffer_ids[i] : 0;
|
544
532
|
}
|
545
533
|
|
546
|
-
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
534
|
+
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
547
535
|
// clear hash tables
|
548
536
|
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
|
549
537
|
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
|
550
538
|
|
539
|
+
// allocate leafs
|
540
|
+
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
541
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
542
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
543
|
+
ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
|
544
|
+
}
|
545
|
+
|
551
546
|
// count number of children and views
|
552
|
-
// allocate
|
547
|
+
// allocate other graph inputs and leafs first to avoid overwriting them
|
553
548
|
for (int i = 0; i < graph->n_nodes; i++) {
|
554
549
|
struct ggml_tensor * node = graph->nodes[i];
|
555
550
|
|
@@ -577,19 +572,6 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
577
572
|
}
|
578
573
|
}
|
579
574
|
|
580
|
-
// allocate the remaining leafs that are unused on the graph
|
581
|
-
// these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
|
582
|
-
for (int i = 0; i < graph->n_leafs; i++) {
|
583
|
-
struct ggml_tensor * leaf = graph->leafs[i];
|
584
|
-
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
585
|
-
|
586
|
-
if (hn->n_children == 0) {
|
587
|
-
assert(!hn->allocated);
|
588
|
-
// since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
|
589
|
-
ggml_gallocr_allocate_node(galloc, leaf, 0);
|
590
|
-
}
|
591
|
-
}
|
592
|
-
|
593
575
|
// allocate tensors
|
594
576
|
for (int i = 0; i < graph->n_nodes; i++) {
|
595
577
|
struct ggml_tensor * node = graph->nodes[i];
|
@@ -652,7 +634,7 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|
652
634
|
}
|
653
635
|
}
|
654
636
|
|
655
|
-
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
|
637
|
+
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
|
656
638
|
size_t hash_size = graph->visited_hash_table.size;
|
657
639
|
|
658
640
|
// initialize hash table
|
@@ -676,7 +658,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
676
658
|
}
|
677
659
|
|
678
660
|
// allocate in hash table
|
679
|
-
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
|
661
|
+
ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
|
680
662
|
|
681
663
|
// set the node_allocs from the hash table
|
682
664
|
if (galloc->n_nodes < graph->n_nodes) {
|
@@ -711,15 +693,16 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
711
693
|
}
|
712
694
|
if (galloc->n_leafs < graph->n_leafs) {
|
713
695
|
free(galloc->leaf_allocs);
|
714
|
-
galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
|
696
|
+
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
|
715
697
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
716
698
|
}
|
717
699
|
galloc->n_leafs = graph->n_leafs;
|
718
700
|
for (int i = 0; i < graph->n_leafs; i++) {
|
719
701
|
struct ggml_tensor * leaf = graph->leafs[i];
|
720
702
|
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
|
721
|
-
galloc->leaf_allocs[i].offset = hn->offset;
|
722
|
-
galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
703
|
+
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
|
704
|
+
galloc->leaf_allocs[i].leaf.offset = hn->offset;
|
705
|
+
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
|
723
706
|
}
|
724
707
|
|
725
708
|
// reallocate buffers if needed
|
@@ -727,7 +710,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
727
710
|
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
|
728
711
|
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
|
729
712
|
|
730
|
-
if (new_size > cur_size) {
|
713
|
+
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
714
|
+
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
731
715
|
#ifndef NDEBUG
|
732
716
|
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
733
717
|
#endif
|
@@ -744,30 +728,30 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
744
728
|
}
|
745
729
|
|
746
730
|
bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
|
747
|
-
return ggml_gallocr_reserve_n(galloc, graph, NULL);
|
731
|
+
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
|
748
732
|
}
|
749
733
|
|
750
|
-
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
751
|
-
assert(
|
734
|
+
static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
|
735
|
+
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
752
736
|
|
753
|
-
if (
|
754
|
-
if (
|
737
|
+
if (tensor->view_src != NULL) {
|
738
|
+
if (tensor->buffer == NULL) {
|
755
739
|
assert(tensor_alloc->offset == SIZE_MAX);
|
756
|
-
if (
|
740
|
+
if (tensor->view_src->buffer == NULL) {
|
757
741
|
// this tensor was allocated without ggml-backend
|
758
742
|
return;
|
759
743
|
}
|
760
|
-
ggml_backend_view_init(galloc->buffers[buffer_id],
|
744
|
+
ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
|
761
745
|
}
|
762
746
|
} else {
|
763
|
-
if (
|
747
|
+
if (tensor->data == NULL) {
|
764
748
|
assert(tensor_alloc->offset != SIZE_MAX);
|
765
|
-
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id],
|
749
|
+
assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
|
766
750
|
void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
|
767
751
|
void * addr = (char *)base + tensor_alloc->offset;
|
768
|
-
ggml_backend_tensor_alloc(galloc->buffers[buffer_id],
|
752
|
+
ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
|
769
753
|
} else {
|
770
|
-
if (
|
754
|
+
if (tensor->buffer == NULL) {
|
771
755
|
// this tensor was allocated without ggml-backend
|
772
756
|
return;
|
773
757
|
}
|
@@ -843,13 +827,18 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
843
827
|
|
844
828
|
// reset buffers
|
845
829
|
for (int i = 0; i < galloc->n_buffers; i++) {
|
846
|
-
// zero size buffers are not allocated
|
847
830
|
if (galloc->buffers[i] != NULL) {
|
848
831
|
ggml_backend_buffer_reset(galloc->buffers[i]);
|
849
832
|
}
|
850
833
|
}
|
851
834
|
|
852
835
|
// allocate the graph tensors from the previous assignments
|
836
|
+
// leafs
|
837
|
+
for (int i = 0; i < graph->n_leafs; i++) {
|
838
|
+
struct ggml_tensor * leaf = graph->leafs[i];
|
839
|
+
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
840
|
+
ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
|
841
|
+
}
|
853
842
|
// nodes
|
854
843
|
for (int i = 0; i < graph->n_nodes; i++) {
|
855
844
|
struct ggml_tensor * node = graph->nodes[i];
|
@@ -863,12 +852,6 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|
863
852
|
}
|
864
853
|
ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
|
865
854
|
}
|
866
|
-
// leafs
|
867
|
-
for (int i = 0; i < graph->n_leafs; i++) {
|
868
|
-
struct ggml_tensor * leaf = graph->leafs[i];
|
869
|
-
struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
|
870
|
-
ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
|
871
|
-
}
|
872
855
|
|
873
856
|
return true;
|
874
857
|
}
|
@@ -900,12 +883,12 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
900
883
|
return false;
|
901
884
|
}
|
902
885
|
|
903
|
-
struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
|
886
|
+
struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
|
904
887
|
|
905
888
|
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
|
906
889
|
if (t->data == NULL) {
|
907
890
|
if (t->view_src == NULL) {
|
908
|
-
ggml_tallocr_alloc(tallocr, t);
|
891
|
+
ggml_tallocr_alloc(&tallocr, t);
|
909
892
|
} else if (t->buffer == NULL) {
|
910
893
|
ggml_backend_view_init(buffer, t);
|
911
894
|
}
|
@@ -917,8 +900,6 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|
917
900
|
}
|
918
901
|
}
|
919
902
|
|
920
|
-
ggml_tallocr_free(tallocr);
|
921
|
-
|
922
903
|
*buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
|
923
904
|
(*buffers)[(*n_buffers)++] = buffer;
|
924
905
|
|
data/vendor/tmp/llama.cpp/ggml-alloc.h
CHANGED
@@ -11,11 +11,15 @@ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
|
|
11
11
|
typedef struct ggml_backend * ggml_backend_t;
|
12
12
|
|
13
13
|
// Tensor allocator
|
14
|
-
|
14
|
+
struct ggml_tallocr {
|
15
|
+
ggml_backend_buffer_t buffer;
|
16
|
+
void * base;
|
17
|
+
size_t alignment;
|
18
|
+
size_t offset;
|
19
|
+
};
|
15
20
|
|
16
|
-
GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
|
17
|
-
GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
|
18
|
-
GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
|
21
|
+
GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
|
22
|
+
GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
|
19
23
|
|
20
24
|
// Graph allocator
|
21
25
|
/*
|
@@ -50,7 +54,11 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
|
50
54
|
// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
|
51
55
|
// returns false if the buffer allocation failed
|
52
56
|
GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
|
53
|
-
GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
|
57
|
+
GGML_API bool ggml_gallocr_reserve_n(
|
58
|
+
ggml_gallocr_t galloc,
|
59
|
+
struct ggml_cgraph * graph,
|
60
|
+
const int * node_buffer_ids,
|
61
|
+
const int * leaf_buffer_ids);
|
54
62
|
|
55
63
|
// automatic reallocation if the topology changes when using a single buffer
|
56
64
|
// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
|
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -86,12 +86,12 @@ extern "C" {
|
|
86
86
|
// (optional) asynchronous tensor data access
|
87
87
|
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
88
88
|
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
89
|
-
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
89
|
+
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
90
90
|
|
91
91
|
// (optional) complete all pending operations
|
92
92
|
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
93
93
|
|
94
|
-
//
|
94
|
+
// compute graph with a plan (not used currently)
|
95
95
|
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
96
96
|
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
97
97
|
|
@@ -102,16 +102,27 @@ extern "C" {
|
|
102
102
|
|
103
103
|
// check if the backend supports an operation
|
104
104
|
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
105
|
+
|
106
|
+
// (optional) event synchronization
|
107
|
+
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
108
|
+
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
109
|
+
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
110
|
+
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
111
|
+
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
105
112
|
};
|
106
113
|
|
107
114
|
struct ggml_backend {
|
108
115
|
ggml_guid_t guid;
|
109
116
|
|
110
117
|
struct ggml_backend_i iface;
|
111
|
-
|
112
118
|
ggml_backend_context_t context;
|
113
119
|
};
|
114
120
|
|
121
|
+
struct ggml_backend_event {
|
122
|
+
ggml_backend_t backend;
|
123
|
+
void * context;
|
124
|
+
};
|
125
|
+
|
115
126
|
//
|
116
127
|
// Backend registry
|
117
128
|
//
|