llama_cpp 0.12.2 → 0.12.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +2 -2
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +68 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +6 -2
- data/vendor/tmp/llama.cpp/Makefile +25 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +87 -27
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +176 -18
- data/vendor/tmp/llama.cpp/ggml-backend.h +14 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1990 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.h +46 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +144 -113
- data/vendor/tmp/llama.cpp/ggml-metal.metal +303 -4
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +95 -3
- data/vendor/tmp/llama.cpp/ggml-opencl.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +736 -59
- data/vendor/tmp/llama.cpp/ggml-quants.h +20 -1
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +15255 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.h +29 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +60854 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +5270 -0
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +34 -0
- data/vendor/tmp/llama.cpp/ggml.c +664 -117
- data/vendor/tmp/llama.cpp/ggml.h +46 -11
- data/vendor/tmp/llama.cpp/llama.cpp +1426 -341
- data/vendor/tmp/llama.cpp/llama.h +24 -15
- data/vendor/tmp/llama.cpp/unicode.h +2 -1
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e77376858bfb07c67b29963a898f3cf9f2494a5cadabbc4cf777e87af801b33c
+  data.tar.gz: 1196c932182a2c76416c326dac934e97cb9111e6bed269c4776e05587391b916
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 594f4af7e1e88f156926b7605683e29b47a7caf3afb2c18434fa0035415902fb51a9dafe845a4a108bce0dfdd9ad63b5301790826ee6995fa1799cf2bff0c1ee
+  data.tar.gz: 4199b0e417efc0e469172c147aa766a81b3f073158eefc13315ab50e4240a4e2f41611e3c87939f4d3012357edf339b1450e49f2bc324f37f92040396342d476
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,18 @@
+## [[0.12.4](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.3...v0.12.4)] - 2024-02-03
+
+- Bump bundled llama.cpp from b1971 to b2047.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_IQ3_XXS`.
+- Add `supports_mmap?`, `supports_mlock?`, and `supports_gpu_offload?` module functions to `LLaMACpp`.
+- Add `--with-vulkan` configuration option.
+- Deprecate `mmap_supported?` and `mlock_supported?` module functions in `LLaMACpp`.
+- Remove `LLAMA_MAX_DEVICES` constant.
+
+## [[0.12.3](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.2...v0.12.3)] - 2024-01-27
+
+- Bump bundled llama.cpp from b1892 to b1971.
+- Add constant for file type: `LLAMA_FTYPE_MOSTLY_Q3_K_XS`.
+- Add `sample_entropy` method to Context.
+
 ## [[0.12.2](https://github.com/yoshoku/llama_cpp.rb/compare/v0.12.1...v0.12.2)] - 2024-01-20
 
 - Bump bundled llama.cpp from b1833 to b1892.
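To make the 0.12.4 entries above concrete, here is a hedged usage sketch of the new capability queries. The method names come from this changelog and the bindings later in this diff; the printed values are illustrative only.

```ruby
require 'llama_cpp'

# New in 0.12.4: capability checks are plain module functions.
puts LLaMACpp.supports_mmap?        # replaces the deprecated LLaMACpp.mmap_supported?
puts LLaMACpp.supports_mlock?       # replaces the deprecated LLaMACpp.mlock_supported?
puts LLaMACpp.supports_gpu_offload? # new; reflects whether the bundled llama.cpp was built with a GPU backend

# LLAMA_MAX_DEVICES was removed; the device count is now queried at runtime.
puts LLaMACpp.max_devices
```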
data/README.md
CHANGED
@@ -28,8 +28,8 @@ There are several installation options:
 # use OpenBLAS
 $ gem install llama_cpp -- --with-openblas
 
-# use
-$ gem install llama_cpp -- --with-
+# use cuBLAS
+$ gem install llama_cpp -- --with-cublas
 ```
 
 Those options are defined in [extconf.rb](https://github.com/yoshoku/llama_cpp.rb/blob/main/ext/llama_cpp/extconf.rb) by with_config method.
data/ext/llama_cpp/extconf.rb
CHANGED
@@ -19,6 +19,7 @@ make_envs << ' LLAMA_CUBLAS=1' if with_config('cublas')
 make_envs << ' LLAMA_CLBLAST=1' if with_config('clblast')
 make_envs << ' LLAMA_HIPBLAS=1' if with_config('hipblas')
 make_envs << ' LLAMA_MPI=1' if with_config('mpi')
+make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan')
 
 Dir.chdir(LLAMA_CPP_DIR) do
   _mkstdout, _mkstderr, mkstatus = Open3.capture3("make lib #{make_envs}".strip)
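For context, the new line above follows the same pattern as the existing backends: flags passed after `--` at install time (e.g. `gem install llama_cpp -- --with-vulkan`, per the README and changelog) make mkmf's `with_config('vulkan')` truthy, so `LLAMA_VULKAN=1` is appended to the environment string handed to `make`. A minimal sketch of that flow, simplified from the file shown here:

```ruby
# Install-time options are forwarded after `--`, e.g.:
#   gem install llama_cpp -- --with-vulkan
require 'mkmf'

make_envs = +''
make_envs << ' LLAMA_VULKAN=1' if with_config('vulkan') # new in 0.12.4

# extconf.rb then builds the vendored llama.cpp with those flags, roughly:
#   Open3.capture3("make lib #{make_envs}".strip)
```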
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -843,15 +843,15 @@ private:
 
   // tensor_split
   static VALUE _llama_model_params_get_tensor_split(VALUE self) {
-    if (
+    if (llama_max_devices() < 1) {
       return rb_ary_new();
     }
-    VALUE ret = rb_ary_new2(
+    VALUE ret = rb_ary_new2(llama_max_devices());
     LLaMAModelParamsWrapper* ptr = get_llama_model_params(self);
     if (ptr->params.tensor_split == nullptr) {
       return rb_ary_new();
     }
-    for (size_t i = 0; i <
+    for (size_t i = 0; i < llama_max_devices(); i++) {
       rb_ary_store(ret, i, DBL2NUM(ptr->params.tensor_split[i]));
     }
     return ret;
@@ -2054,6 +2054,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_tail_free", RUBY_METHOD_FUNC(_llama_context_sample_tail_free), -1);
     rb_define_method(rb_cLLaMAContext, "sample_typical", RUBY_METHOD_FUNC(_llama_context_sample_typical), -1);
     rb_define_method(rb_cLLaMAContext, "sample_temp", RUBY_METHOD_FUNC(_llama_context_sample_temp), -1);
+    rb_define_method(rb_cLLaMAContext, "sample_entropy", RUBY_METHOD_FUNC(_llama_context_sample_entropy), -1);
     rb_define_method(rb_cLLaMAContext, "sample_temperature", RUBY_METHOD_FUNC(_llama_context_sample_temperature), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat), -1);
     rb_define_method(rb_cLLaMAContext, "sample_token_mirostat_v2", RUBY_METHOD_FUNC(_llama_context_sample_token_mirostat_v2), -1);
@@ -2904,6 +2905,50 @@ private:
     return Qnil;
   }
 
+  static VALUE _llama_context_sample_entropy(int argc, VALUE* argv, VALUE self) {
+    VALUE kw_args = Qnil;
+    ID kw_table[3] = { rb_intern("min_temp"), rb_intern("max_temp"), rb_intern("exponent_val") };
+    VALUE kw_values[3] = { Qundef, Qundef, Qundef };
+    VALUE candidates = Qnil;
+    rb_scan_args(argc, argv, "1:", &candidates, &kw_args);
+    rb_get_kwargs(kw_args, kw_table, 3, 0, kw_values);
+
+    if (!rb_obj_is_kind_of(candidates, rb_cLLaMATokenDataArray)) {
+      rb_raise(rb_eArgError, "1st argument must be a TokenDataArray");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[0])) {
+      rb_raise(rb_eArgError, "min_temp must be a float");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[1])) {
+      rb_raise(rb_eArgError, "max_temp must be a float");
+      return Qnil;
+    }
+    if (!RB_FLOAT_TYPE_P(kw_values[2])) {
+      rb_raise(rb_eArgError, "exponent_val must be a float");
+      return Qnil;
+    }
+
+    LLaMAContextWrapper* ctx_ptr = get_llama_context(self);
+    if (ctx_ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    LLaMATokenDataArrayWrapper* cnd_ptr = RbLLaMATokenDataArray::get_llama_token_data_array(candidates);
+    if (cnd_ptr->array.data == nullptr) {
+      rb_raise(rb_eRuntimeError, "TokenDataArray is empty");
+      return Qnil;
+    }
+    const float min_temp = NUM2DBL(kw_values[0]);
+    const float max_temp = NUM2DBL(kw_values[1]);
+    const float exponent_val = NUM2DBL(kw_values[2]);
+
+    llama_sample_entropy(ctx_ptr->ctx, &(cnd_ptr->array), min_temp, max_temp, exponent_val);
+
+    return Qnil;
+  }
+
   static VALUE _llama_context_sample_temperature(int argc, VALUE* argv, VALUE self) {
     VALUE kw_args = Qnil;
     ID kw_table[1] = { rb_intern("temperature") };
@@ -3214,15 +3259,29 @@ static VALUE rb_llama_time_us(VALUE self) {
 }
 
 static VALUE rb_llama_mmap_supported(VALUE self) {
+  rb_warn("mmap_supported? is deprecated. Use supports_mmap? instead.");
   return llama_mmap_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_mlock_supported(VALUE self) {
+  rb_warn("mlock_supported? is deprecated. Use supports_mlock? instead.");
   return llama_mlock_supported() ? Qtrue : Qfalse;
 }
 
 static VALUE rb_llama_max_devices(VALUE self) {
-  return
+  return SIZET2NUM(llama_max_devices());
+}
+
+static VALUE rb_llama_supports_mmap(VALUE self) {
+  return llama_supports_mmap() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_mlock(VALUE self) {
+  return llama_supports_mlock() ? Qtrue : Qfalse;
+}
+
+static VALUE rb_llama_supports_gpu_offload(VALUE self) {
+  return llama_supports_gpu_offload() ? Qtrue : Qfalse;
 }
 
 extern "C" void Init_llama_cpp(void) {
@@ -3249,8 +3308,9 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_module_function(rb_mLLaMACpp, "mmap_supported?", rb_llama_mmap_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "mlock_supported?", rb_llama_mlock_supported, 0);
   rb_define_module_function(rb_mLLaMACpp, "max_devices", rb_llama_max_devices, 0);
-
-
+  rb_define_module_function(rb_mLLaMACpp, "supports_mmap?", rb_llama_supports_mmap, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_mlock?", rb_llama_supports_mlock, 0);
+  rb_define_module_function(rb_mLLaMACpp, "supports_gpu_offload?", rb_llama_supports_gpu_offload, 0);
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_SPM", INT2NUM(LLAMA_VOCAB_TYPE_SPM));
   rb_define_const(rb_mLLaMACpp, "LLAMA_VOCAB_TYPE_BPE", INT2NUM(LLAMA_VOCAB_TYPE_BPE));
@@ -3283,6 +3343,8 @@ extern "C" void Init_llama_cpp(void) {
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XXS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ2_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ2_XS));
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q2_K_S", INT2NUM(LLAMA_FTYPE_MOSTLY_Q2_K_S));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_Q3_K_XS", INT2NUM(LLAMA_FTYPE_MOSTLY_Q3_K_XS));
+  rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_MOSTLY_IQ3_XXS", INT2NUM(LLAMA_FTYPE_MOSTLY_IQ3_XXS));
 
   rb_define_const(rb_mLLaMACpp, "LLAMA_FTYPE_GUESSED", INT2NUM(LLAMA_FTYPE_GUESSED));
 
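Taken together, the binding and registration above expose the new sampler as `Context#sample_entropy`. A hedged sketch of the call from Ruby follows; only the method name, the keyword names, and the argument checks are taken from this diff, while the context and candidate-array setup is assumed and shown schematically.

```ruby
require 'llama_cpp'

# Assumed setup (not part of this diff): the caller passes an initialized
# LLaMACpp::Context and a LLaMACpp::TokenDataArray built from the logits
# of the last evaluation.
def apply_dynamic_temperature(context, candidates)
  # Per the checks in _llama_context_sample_entropy, the first argument must
  # be a TokenDataArray and all three keyword arguments must be Floats.
  context.sample_entropy(candidates, min_temp: 0.1, max_temp: 2.0, exponent_val: 1.0)
end

# Module-level queries registered in Init_llama_cpp above:
LLaMACpp.supports_mmap?  # => true or false
LLaMACpp.max_devices     # => Integer, now backed by llama_max_devices()
```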
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.12.
+  VERSION = '0.12.4'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = '
+  LLAMA_CPP_VERSION = 'b2047'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -3,8 +3,6 @@ module LLaMACpp
   LLAMA_CPP_VERSION: String
   LLAMA_DEFALUT_SEED: String
 
-  LLAMA_MAX_DEVICES: Integer
-
   LLAMA_FTYPE_ALL_F32: Integer
   LLAMA_FTYPE_MOSTLY_F16: Integer
   LLAMA_FTYPE_MOSTLY_Q4_0: Integer
@@ -25,6 +23,8 @@ module LLaMACpp
   LLAMA_FTYPE_MOSTLY_IQ2_XXS: Integer
   LLAMA_FTYPE_MOSTLY_IQ2_XS: Integer
   LLAMA_FTYPE_MOSTLY_Q2_K_S: Integer
+  LLAMA_FTYPE_MOSTLY_Q3_K_XS: Integer
+  LLAMA_FTYPE_MOSTLY_IQ3_XXS: Integer
 
   LLAMA_KV_OVERRIDE_INT: Integer
   LLAMA_KV_OVERRIDE_FLOAT: Integer
@@ -60,6 +60,9 @@ module LLaMACpp
   def self?.mmap_supported?: () -> bool
   def self?.mlock_supported?: () -> bool
   def self?.max_devices: () -> Integer
+  def self?.supports_mmap?: () -> bool
+  def self?.supports_mlock?: () -> bool
+  def self?.supports_gpu_offload?: () -> bool
 
   class TokenData
     public
@@ -216,6 +219,7 @@ module LLaMACpp
     def sample_tail_free: (::LLaMACpp::TokenDataArray, z: Float, ?min_keep: Integer) -> void
     def sample_typical: (::LLaMACpp::TokenDataArray, prob: Float, ?min_keep: Integer) -> void
     def sample_temp: (::LLaMACpp::TokenDataArray, temp: Float) -> void
+    def sample_entropy: (::LLaMACpp::TokenDataArray, min_temp: Float, max_temp: Float, exponent_val: Float) -> void
     def sample_temperature: (::LLaMACpp::TokenDataArray, temperature: Float) -> void
     def sample_token_mirostat: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, m: Integer, mu: Float) -> [Integer, Float]
     def sample_token_mirostat_v2: (::LLaMACpp::TokenDataArray, tau: Float, eta: Float, mu: Float) -> [Integer, Float]
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops
+	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -450,6 +450,19 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 endif # LLAMA_CLBLAST
 
+ifdef LLAMA_VULKAN
+	MK_CPPFLAGS += -DGGML_USE_VULKAN
+	MK_LDFLAGS += -lvulkan
+	OBJS += ggml-vulkan.o
+
+ifdef LLAMA_VULKAN_CHECK_RESULTS
+	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_VULKAN
+
 ifdef LLAMA_HIPBLAS
 
 ifeq ($(wildcard /opt/rocm),)
@@ -575,12 +588,15 @@ train.o: common/train.cpp common/train.h
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
+libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+	ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
+
 lib: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama$(DSO_EXT) $^ $(LDFLAGS)
 	ar rcs libllama.a $^
 
 clean:
-	rm -vrf *.o tests/*.o *.so *.dll *.dylib
+	rm -vrf *.o tests/*.o *.so *.a *.dll *.dylib benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
 
 #
 # Examples
@@ -625,7 +641,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
 
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
@@ -753,3 +769,9 @@ tests/test-c.o: tests/test-c.c llama.h
 
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
         if (block->size >= size) {
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
-            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                    __func__, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
+                    __func__, tensor->name, size, max_avail);
             GGML_ASSERT(!"not enough space in the buffer");
             return;
         }
@@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
 }
 
 size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
-
+    // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
+    // to avoid this, we add a 10% margin to the buffer size
+    return alloc->max_size + alloc->max_size/10;
 }
 
 // graph allocator
@@ -776,38 +778,26 @@ size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph)
 }
 
 // utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
-    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
-    size_t alignment = ggml_backend_buft_get_alignment(buft);
-
-    size_t nbytes = 0;
-    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-        if (t->data == NULL && t->view_src == NULL) {
-            nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
-        }
-    }
-
-    if (nbytes == 0) {
-        // all the tensors in the context are already allocated
-#ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
-        return NULL;
-    }
 
-
+static bool alloc_tensor_range(struct ggml_context * ctx,
+        struct ggml_tensor * first, struct ggml_tensor * last,
+        ggml_backend_buffer_type_t buft, size_t size,
+        ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
-        // failed to allocate buffer
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
 #endif
-
+        for (size_t i = 0; i < *n_buffers; i++) {
+            ggml_backend_buffer_free(*buffers[i]);
+        }
+        free(*buffers);
+        return false;
     }
 
     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
 
-    for (struct ggml_tensor * t =
+    for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
         if (t->data == NULL) {
             if (t->view_src == NULL) {
                 ggml_tallocr_alloc(tallocr, t);
@@ -824,6 +814,76 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
 
     ggml_tallocr_free(tallocr);
 
+    *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+    (*buffers)[(*n_buffers)++] = buffer;
+
+    return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+    GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+    size_t alignment = ggml_backend_buft_get_alignment(buft);
+    size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+    ggml_backend_buffer_t * buffers = NULL;
+    size_t n_buffers = 0;
+
+    size_t cur_buf_size = 0;
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        size_t this_size = 0;
+        if (t->data == NULL && t->view_src == NULL) {
+            this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+        }
+
+        if (this_size > max_size) {
+            // tensor is too large to fit in a single buffer
+            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+                    __func__, t->name,
+                    ggml_backend_buft_name(buft),
+                    this_size, max_size);
+            for (size_t i = 0; i < n_buffers; i++) {
+                ggml_backend_buffer_free(buffers[i]);
+            }
+            free(buffers);
+            return NULL;
+        }
+
+        if ((cur_buf_size + this_size) > max_size) {
+            // allocate tensors in the current buffer
+            if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+                return NULL;
+            }
+            first = t;
+            cur_buf_size = this_size;
+        } else {
+            cur_buf_size += this_size;
+        }
+    }
+
+    // allocate remaining tensors
+    if (cur_buf_size > 0) {
+        if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+            return NULL;
+        }
+    }
+
+    if (n_buffers == 0) {
+        // all the tensors in the context are already allocated
+#ifndef NDEBUG
+        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+        return NULL;
+    }
+
+    ggml_backend_buffer_t buffer;
+    if (n_buffers == 1) {
+        buffer = buffers[0];
+    } else {
+        buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+    }
+    free(buffers);
     return buffer;
 }
 
data/vendor/tmp/llama.cpp/ggml-backend-impl.h
CHANGED
@@ -19,6 +19,7 @@ extern "C" {
         const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
         size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
         size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
@@ -63,6 +64,11 @@ extern "C" {
     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
 
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
     //
     // Backend
     //