llama_cpp 0.12.2 → 0.12.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+  data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+  data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+- Bump bundled llama.cpp from b1971 to b2047.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+- Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+- Add `--with-vulkan` configuration option.
+- Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+- Remove `LLAMA_MAX_DEVICES` constant.
+
+## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
+
+- Bump bundled llama.cpp from b1892 to b1971.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_Q3_K_XS`.
+- Add `sample_entropy` method to Context.
+
 ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
 
 - Bump bundled llama.cpp from b1833 to b1892.
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use cuBLAS
+$ gem install llama_cpp -- --with-cublas
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
 Dir.chdir(LLAMA_CPP_DIR) do
   _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -843,15 +843,15 @@ private:
 
   // tensor_split
   static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-    if (
+    if (llama_max_devices() < 1) {
       return rb_ary_new();
     }
-    VALUE ret = rb_ary_new2(
+    VALUE ret = rb_ary_new2(llama_max_devices());
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     if (ptr->params.tensor_split == nullptr) {
       return rb_ary_new();
     }
-    for (size_t i = 0; i <
+    for (size_t i = 0; i < llama_max_devices(); i++) {
       rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
     }
     return ret;
@@ -2054,6 +2054,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
     rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
     rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
+    rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
     rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
@@ -2904,6 +2905,50 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_sample_entropy(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("min_temp"), rb_intern("max_temp"), rb_intern("exponent_val") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    VALUE candidates = Qnil;
+    rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+
+    if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
+      rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "min_temp must be a float");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "max_temp must be a float");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "exponent_val must be a float");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
+    if (cnd_ptr->array.data == nullptr) {
+      rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
+      return Qnil;
+    }
+    const float min_temp = NUM2DBL(kw_values[0]);
+    const float max_temp = NUM2DBL(kw_values[1]);
+    const float exponent_val = NUM2DBL(kw_values[2]);
+
+    llama_sample_entropy(ctx_ptr->ctx, &(cnd_ptr->array), min_temp, max_temp, exponent_val);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("temperature") };
@@ -3214,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
 }
 
 static VALUE rb_llama_mmap_supported(VALUE self) {
+  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_mlock_supported(VALUE self) {
+  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_max_devices(VALUE self) {
-  return
+  return SIZET2NUM(llama_max_devices());
+}
+
+static VALUE rb_llama_supports_mmap(VALUE self) {
+  return llama_supports_mmap() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_mlock(VALUE self) {
+  return llama_supports_mlock() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
 }
 
 extern "C" void Init_llama_cpp(void) {
@@ -3249,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-
+  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3283,6 +3343,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
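The llama_cpp.cpp diff above adds a `_llama_context_sample_entropy` binding registered as `Context#sample_entropy` (dynamic-temperature sampling). A hedged usage sketch follows; the model path and the candidate-building boilerplate are illustrative, while the keyword arguments (`min_temp`, `max_temp`, `exponent_val`) match the binding:

```ruby
require 'llama_cpp'

model = LLaMACpp::Model.new(model_path: 'path/to/model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# ... evaluate the prompt, then build candidates from the current logits ...
candidates = LLaMACpp::TokenDataArray.new(
  context.logits.each_with_index.map { |logit, id| LLaMACpp::TokenData.new(id: id, logit: logit, p: 0.0) }
)

# Entropy-based temperature scaling; all three keyword arguments must be Floats.
context.sample_entropy(candidates, min_temp: 0.1, max_temp: 2.0, exponent_val: 1.0)
token = context.sample_token(candidates)
```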
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.
+  VERSION = '0.12.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b2047'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +23,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -60,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
@@ -216,6 +219,7 @@ module LLaMACpp
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
+    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
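The signatures above cover both the deprecated predicates (`mmap_supported?`, `mlock_supported?`) and their 0.12.4 replacements. The old names still work but now print a warning (see the `rb_warn` additions in llama_cpp.cpp), so callers can migrate with a small guard; the helper below is a hedged sketch and its name is illustrative, not part of the gem:

```ruby
# Prefer the 0.12.4 predicate, fall back to the older name on previous releases.
def mmap_available?
  if LLaMACpp.respond_to?(:supports_mmap?)
    LLaMACpp.supports_mmap?   # 0.12.4+
  else
    LLaMACpp.mmap_supported?  # pre-0.12.4 name (deprecated in 0.12.4)
  end
end
```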
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_VULKAN
+	MK_CPPFLAGS += -DGGML_USE_VULKAN
+	MK_LDFLAGS += -lvulkan
+	OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS
 
 ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
 lib: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll *.dylib
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -753,3 +769,9 @@ tests/test-c.o: tests/test-c.c llama.h
 
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                    __func__, tensor->name, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
             return;
         }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }
 
 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }
 
 // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
 
-
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(*buffers);
+        return false;
     }
 
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-    for (struct ggml_tensor * t =
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     ggml_tallocr_free(tallocr);
 
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
 
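The reworked allocator above packs a context's tensors into as many backend buffers as needed, starting a new one whenever the backend's maximum buffer size would be exceeded. Purely as an illustration of that splitting strategy (this is a Ruby model, not the C implementation, and the helper name is made up):

```ruby
# Greedy packing: start a new buffer whenever the next tensor would push the
# current buffer past max_size; a single tensor larger than max_size is an error.
def split_into_buffers(tensor_sizes, max_size)
  raise ArgumentError, 'tensor larger than max buffer size' if tensor_sizes.any? { |s| s > max_size }

  buffers = []
  current = 0
  tensor_sizes.each do |size|
    if current + size > max_size && current.positive?
      buffers << current
      current = 0
    end
    current += size
  end
  buffers << current if current.positive?
  buffers
end

mib = 1024 * 1024
# Three 100 MiB tensors with a 256 MiB cap end up in two buffers: [200 MiB, 100 MiB].
p split_into_buffers([100 * mib, 100 * mib, 100 * mib], 256 * mib)
```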
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -19,6 +19,7 @@ extern "C" {
     const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
     ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
     size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+    size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
     size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
     bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
     // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //