llama_cpp 0.14.5 → 0.14.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
|
4
|
+
data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
|
7
|
+
data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2698 to b2740.
|
4
|
+
- Add `keep_split` accessor to `ModelQuantizeParams`.
|
5
|
+
- Add `pooling_type` method to `Context`.
|
6
|
+
- Add `token_is_eog?` method to `Model`.
|
7
|
+
|
8
|
+
The implementation of a binding for `llama_sample_token_with_rng` has been skipped.
|
9
|
+
|
10
|
+
## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
|
11
|
+
|
12
|
+
- Bump llama.cpp from b2658 to b2698.
|
13
|
+
|
1
14
|
## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
|
2
15
|
|
3
16
|
- Bump llama.cpp from b2608 to b2658.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1321,6 +1321,8 @@ public:
|
|
1321
1321
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
|
1322
1322
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
|
1323
1323
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
|
1324
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
|
1325
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
|
1324
1326
|
}
|
1325
1327
|
|
1326
1328
|
private:
|
@@ -1405,6 +1407,18 @@ private:
|
|
1405
1407
|
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1406
1408
|
return ptr->params.pure ? Qtrue : Qfalse;
|
1407
1409
|
}
|
1410
|
+
|
1411
|
+
// keep_split
|
1412
|
+
static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
|
1413
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1414
|
+
ptr->params.keep_split = RTEST(keep_split) ? true : false;
|
1415
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1416
|
+
}
|
1417
|
+
|
1418
|
+
static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
|
1419
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1420
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1421
|
+
}
|
1408
1422
|
};
|
1409
1423
|
|
1410
1424
|
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
@@ -1487,6 +1501,7 @@ public:
|
|
1487
1501
|
rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
|
1488
1502
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1489
1503
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1504
|
+
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1490
1505
|
}
|
1491
1506
|
|
1492
1507
|
private:
|
@@ -1634,10 +1649,10 @@ private:
|
|
1634
1649
|
const llama_token token = NUM2INT(token_);
|
1635
1650
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1636
1651
|
std::vector<char> result(8, 0);
|
1637
|
-
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1652
|
+
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1638
1653
|
if (n_tokens < 0) {
|
1639
1654
|
result.resize(-n_tokens);
|
1640
|
-
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1655
|
+
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1641
1656
|
if (check != -n_tokens) {
|
1642
1657
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
1643
1658
|
return Qnil;
|
@@ -1789,6 +1804,16 @@ private:
|
|
1789
1804
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1790
1805
|
return INT2NUM(llama_token_eot(ptr->model));
|
1791
1806
|
}
|
1807
|
+
|
1808
|
+
static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
|
1809
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1810
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1811
|
+
return Qnil;
|
1812
|
+
}
|
1813
|
+
const llama_token token = NUM2INT(token_);
|
1814
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1815
|
+
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1816
|
+
}
|
1792
1817
|
};
|
1793
1818
|
|
1794
1819
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -2102,6 +2127,7 @@ public:
|
|
2102
2127
|
rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
|
2103
2128
|
rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
|
2104
2129
|
rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
|
2130
|
+
rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
|
2105
2131
|
}
|
2106
2132
|
|
2107
2133
|
private:
|
@@ -3225,6 +3251,15 @@ private:
|
|
3225
3251
|
|
3226
3252
|
return Qnil;
|
3227
3253
|
}
|
3254
|
+
|
3255
|
+
static VALUE _llama_context_pooling_type(VALUE self) {
|
3256
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
3257
|
+
if (ptr->ctx == NULL) {
|
3258
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
3259
|
+
return Qnil;
|
3260
|
+
}
|
3261
|
+
return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
|
3262
|
+
}
|
3228
3263
|
};
|
3229
3264
|
|
3230
3265
|
const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.5'
|
6
|
+
VERSION = '0.14.7'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2658'
|
9
|
+
LLAMA_CPP_VERSION = 'b2740'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
|
|
141
141
|
def token_middle: () -> Integer
|
142
142
|
def token_suffix: () -> Integer
|
143
143
|
def token_eot: () -> Integer
|
144
|
+
def token_is_eog?: (Integer) -> bool
|
144
145
|
end
|
145
146
|
|
146
147
|
class Timings
|
@@ -260,6 +261,7 @@ module LLaMACpp
|
|
260
261
|
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
261
262
|
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
262
263
|
def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
|
264
|
+
def pooling_type: () -> Integer
|
263
265
|
end
|
264
266
|
|
265
267
|
class ContextParams
|
@@ -328,6 +330,8 @@ module LLaMACpp
|
|
328
330
|
def only_copy=: (bool) -> bool
|
329
331
|
def pure: () -> bool
|
330
332
|
def pure=: (bool) -> bool
|
333
|
+
def keep_split: () -> bool
|
334
|
+
def keep_split=: (bool) -> bool
|
331
335
|
end
|
332
336
|
|
333
337
|
class Params = ContextParams
|
@@ -386,6 +386,11 @@ ifdef LLAMA_OPENBLAS
|
|
386
386
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
387
387
|
endif # LLAMA_OPENBLAS
|
388
388
|
|
389
|
+
ifndef LLAMA_NO_LLAMAFILE
|
390
|
+
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
391
|
+
OBJS += sgemm.o
|
392
|
+
endif
|
393
|
+
|
389
394
|
ifdef LLAMA_BLIS
|
390
395
|
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
391
396
|
MK_LDFLAGS += -lblis -L/usr/local/lib
|
@@ -482,11 +487,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
|
|
482
487
|
|
483
488
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
484
489
|
$(NVCC_COMPILE)
|
485
|
-
|
486
490
|
endif # LLAMA_CUDA
|
487
491
|
|
488
492
|
ifdef LLAMA_CLBLAST
|
489
|
-
|
490
493
|
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
|
491
494
|
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
492
495
|
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
|
@@ -605,6 +608,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
|
|
605
608
|
$(CC) $(CFLAGS) -c $< -o $@
|
606
609
|
endif # LLAMA_MPI
|
607
610
|
|
611
|
+
ifndef LLAMA_NO_LLAMAFILE
|
612
|
+
sgemm.o: sgemm.cpp sgemm.h ggml.h
|
613
|
+
$(CXX) $(CXXFLAGS) -c $< -o $@
|
614
|
+
endif
|
615
|
+
|
608
616
|
GF_CC := $(CC)
|
609
617
|
include scripts/get-flags.mk
|
610
618
|
|
@@ -689,8 +697,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
|
689
697
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
690
698
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
691
699
|
|
692
|
-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
693
|
-
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
|
700
|
+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
701
|
+
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
694
702
|
|
695
703
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
696
704
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
@@ -724,7 +732,7 @@ lib: llama.o ggml.o $(OBJS)
|
|
724
732
|
ar rcs libllama.a $^
|
725
733
|
|
726
734
|
clean:
|
727
|
-
rm -vrf *.o tests/*.o *.so *.a *.dll
|
735
|
+
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
728
736
|
rm -vrf ggml-cuda/*.o
|
729
737
|
|
730
738
|
#
|
@@ -761,7 +769,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
|
|
761
769
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
762
770
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
763
771
|
|
764
|
-
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o
|
772
|
+
batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
765
773
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
766
774
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
767
775
|
|
@@ -793,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
793
801
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
794
802
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
795
803
|
|
796
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.
|
804
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
797
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
798
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
799
807
|
|
808
|
+
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
809
|
+
examples/server/%.hpp: examples/server/public/% Makefile
|
810
|
+
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
|
811
|
+
echo "unsigned char $${NAME}[] = {" && \
|
812
|
+
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
|
813
|
+
echo "};" && \
|
814
|
+
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
815
|
+
) > $@
|
816
|
+
|
800
817
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
801
818
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
802
819
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -371,16 +371,16 @@ struct ggml_gallocr {
|
|
371
371
|
};
|
372
372
|
|
373
373
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
374
|
-
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr)
|
374
|
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
375
375
|
GGML_ASSERT(galloc != NULL);
|
376
376
|
|
377
|
-
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t)
|
377
|
+
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
378
378
|
GGML_ASSERT(galloc->bufts != NULL);
|
379
379
|
|
380
|
-
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs
|
380
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
381
381
|
GGML_ASSERT(galloc->buffers != NULL);
|
382
382
|
|
383
|
-
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *)
|
383
|
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
384
384
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
385
385
|
|
386
386
|
for (int i = 0; i < n_bufs; i++) {
|
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
646
646
|
free(galloc->hash_set.keys);
|
647
647
|
free(galloc->hash_values);
|
648
648
|
galloc->hash_set.size = hash_size;
|
649
|
-
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *)
|
650
|
-
galloc->hash_values = calloc(sizeof(struct hash_node)
|
649
|
+
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
650
|
+
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
651
651
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
652
652
|
GGML_ASSERT(galloc->hash_values != NULL);
|
653
653
|
} else {
|
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
667
667
|
// set the node_allocs from the hash table
|
668
668
|
if (galloc->n_nodes < graph->n_nodes) {
|
669
669
|
free(galloc->node_allocs);
|
670
|
-
galloc->node_allocs = calloc(sizeof(struct node_alloc)
|
670
|
+
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
671
671
|
GGML_ASSERT(galloc->node_allocs != NULL);
|
672
672
|
}
|
673
673
|
galloc->n_nodes = graph->n_nodes;
|
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
697
697
|
}
|
698
698
|
if (galloc->n_leafs < graph->n_leafs) {
|
699
699
|
free(galloc->leaf_allocs);
|
700
|
-
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0])
|
700
|
+
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
701
701
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
702
702
|
}
|
703
703
|
galloc->n_leafs = graph->n_leafs;
|
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
822
822
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
823
823
|
switch (op->op) {
|
824
824
|
case GGML_OP_CPY:
|
825
|
-
return
|
825
|
+
return
|
826
|
+
op->type != GGML_TYPE_IQ2_XXS &&
|
827
|
+
op->type != GGML_TYPE_IQ2_XS &&
|
828
|
+
op->type != GGML_TYPE_IQ1_S &&
|
829
|
+
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
826
830
|
case GGML_OP_MUL_MAT:
|
827
831
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
828
832
|
default:
|
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
1721
1725
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
1722
1726
|
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1723
1727
|
|
1724
|
-
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched)
|
1728
|
+
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
1725
1729
|
|
1726
1730
|
// initialize hash table
|
1727
1731
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
1728
|
-
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0])
|
1729
|
-
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0])
|
1732
|
+
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
1733
|
+
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
1730
1734
|
|
1731
1735
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1732
|
-
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0])
|
1733
|
-
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0])
|
1736
|
+
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1737
|
+
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1734
1738
|
|
1735
1739
|
sched->n_backends = n_backends;
|
1736
1740
|
|
1737
1741
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
1738
1742
|
|
1739
1743
|
const int initial_splits_capacity = 16;
|
1740
|
-
sched->splits = calloc(sizeof(sched->splits[0])
|
1744
|
+
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1741
1745
|
sched->splits_capacity = initial_splits_capacity;
|
1742
1746
|
|
1743
1747
|
for (int b = 0; b < n_backends; b++) {
|
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
1968
1972
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1969
1973
|
struct ggml_hash_set hash_set = {
|
1970
1974
|
/* .size = */ graph->visited_hash_table.size,
|
1971
|
-
/* .keys = */ calloc(sizeof(hash_set.keys[0])
|
1975
|
+
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
1972
1976
|
};
|
1973
|
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
|
1974
|
-
bool * node_init = calloc(sizeof(node_init[0])
|
1977
|
+
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1978
|
+
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
1975
1979
|
|
1976
1980
|
struct ggml_init_params params = {
|
1977
1981
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
@@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1231
1231
|
|
1232
1232
|
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
|
1233
1233
|
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
|
1234
|
-
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
|
1234
|
+
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
|
1235
1235
|
if (src0->type != GGML_TYPE_F16) {
|
1236
1236
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
1237
1237
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1241
1241
|
}
|
1242
1242
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
|
1243
1243
|
|
1244
|
-
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
|
1244
|
+
ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
|
1245
1245
|
if (src1->type != GGML_TYPE_F16) {
|
1246
1246
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
1247
1247
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
@@ -1250,7 +1250,7 @@ static void ggml_cuda_op_mul_mat_cublas(
|
|
1250
1250
|
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
|
1251
1251
|
}
|
1252
1252
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
|
1253
|
-
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
|
1253
|
+
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
|
1254
1254
|
|
1255
1255
|
const half alpha_f16 = 1.0f;
|
1256
1256
|
const half beta_f16 = 0.0f;
|
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
1946
1946
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
1947
1947
|
// KQV single-batch
|
1948
1948
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
1949
|
-
} else if (!split &&
|
1949
|
+
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
1950
1950
|
// KQ + KQV multi-batch
|
1951
1951
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
1952
1952
|
} else if (use_dequantize_mul_mat_vec) {
|
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
|
1960
1960
|
}
|
1961
1961
|
}
|
1962
1962
|
|
1963
|
+
struct mmid_row_mapping {
|
1964
|
+
int32_t i1;
|
1965
|
+
int32_t i2;
|
1966
|
+
};
|
1967
|
+
|
1968
|
+
static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
|
1969
|
+
int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
|
1970
|
+
const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
|
1971
|
+
int64_t ne11, int64_t ne10,
|
1972
|
+
size_t nb11, size_t nb12) {
|
1973
|
+
int32_t iid1 = blockIdx.x;
|
1974
|
+
int32_t id = blockIdx.y;
|
1975
|
+
|
1976
|
+
const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
|
1977
|
+
|
1978
|
+
if (row_id_i != i02) {
|
1979
|
+
return;
|
1980
|
+
}
|
1981
|
+
|
1982
|
+
const int64_t i11 = id % ne11;
|
1983
|
+
const int64_t i12 = iid1;
|
1984
|
+
|
1985
|
+
__shared__ int src1_row;
|
1986
|
+
if (threadIdx.x == 0) {
|
1987
|
+
src1_row = atomicAdd(cur_src1_row, 1);
|
1988
|
+
row_mapping[src1_row] = {id, iid1};
|
1989
|
+
}
|
1990
|
+
__syncthreads();
|
1991
|
+
|
1992
|
+
const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
|
1993
|
+
float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
|
1994
|
+
|
1995
|
+
for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
|
1996
|
+
src1_row_contiguous[i] = src1_row_original[i];
|
1997
|
+
}
|
1998
|
+
}
|
1999
|
+
|
2000
|
+
static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
|
2001
|
+
const mmid_row_mapping * __restrict__ row_mapping,
|
2002
|
+
int64_t ne0,
|
2003
|
+
size_t nb1, size_t nb2) {
|
2004
|
+
int32_t i = blockIdx.x;
|
2005
|
+
|
2006
|
+
const int32_t i1 = row_mapping[i].i1;
|
2007
|
+
const int32_t i2 = row_mapping[i].i2;
|
2008
|
+
|
2009
|
+
const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
|
2010
|
+
float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
|
2011
|
+
|
2012
|
+
for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
|
2013
|
+
dst_row_original[j] = dst_row_contiguous[j];
|
2014
|
+
}
|
2015
|
+
}
|
2016
|
+
|
1963
2017
|
static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
1964
2018
|
const ggml_tensor * src0 = dst->src[0];
|
1965
2019
|
const ggml_tensor * src1 = dst->src[1];
|
1966
2020
|
const ggml_tensor * ids = dst->src[2];
|
1967
2021
|
|
2022
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
2023
|
+
|
1968
2024
|
GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
|
1969
2025
|
|
1970
2026
|
cudaStream_t stream = ctx.stream();
|
1971
2027
|
|
1972
|
-
const
|
1973
|
-
const
|
1974
|
-
|
1975
|
-
const int32_t id = ((int32_t *) dst->op_params)[0];
|
1976
|
-
const int32_t n_as = src0->ne[2];
|
2028
|
+
const int64_t n_as = ne02;
|
2029
|
+
const int64_t n_ids = ids->ne[0];
|
1977
2030
|
|
1978
2031
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
1979
2032
|
const char * ids_dev = (const char *) ids->data;
|
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
1982
2035
|
|
1983
2036
|
ggml_tensor src0_row = *src0;
|
1984
2037
|
ggml_tensor src1_row = *src1;
|
1985
|
-
ggml_tensor dst_row
|
2038
|
+
ggml_tensor dst_row = *dst;
|
1986
2039
|
|
1987
2040
|
char * src0_original = (char *) src0->data;
|
1988
2041
|
char * src1_original = (char *) src1->data;
|
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
1990
2043
|
|
1991
2044
|
src0_row.ne[2] = 1;
|
1992
2045
|
src0_row.ne[3] = 1;
|
1993
|
-
src0_row.nb[3] =
|
2046
|
+
src0_row.nb[3] = nb02;
|
1994
2047
|
|
1995
|
-
|
1996
|
-
|
1997
|
-
|
2048
|
+
src1_row.ne[1] = 1;
|
2049
|
+
src1_row.ne[2] = 1;
|
2050
|
+
src1_row.ne[3] = 1;
|
2051
|
+
src1_row.nb[2] = nb11;
|
2052
|
+
src1_row.nb[3] = nb11;
|
1998
2053
|
|
1999
|
-
|
2054
|
+
dst_row.ne[1] = 1;
|
2055
|
+
dst_row.ne[2] = 1;
|
2056
|
+
dst_row.ne[3] = 1;
|
2057
|
+
dst_row.nb[2] = nb1;
|
2058
|
+
dst_row.nb[3] = nb1;
|
2000
2059
|
|
2001
|
-
|
2002
|
-
|
2003
|
-
|
2060
|
+
if (ne12 == 1) {
|
2061
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
2062
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
2063
|
+
const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2004
2064
|
|
2005
|
-
|
2065
|
+
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
2066
|
+
|
2067
|
+
const int64_t i11 = id % ne11;
|
2068
|
+
const int64_t i12 = iid1;
|
2069
|
+
|
2070
|
+
const int64_t i1 = id;
|
2071
|
+
const int64_t i2 = i12;
|
2072
|
+
|
2073
|
+
src0_row.data = src0_original + i02*nb02;
|
2074
|
+
src1_row.data = src1_original + i11*nb11 + i12*nb12;
|
2075
|
+
dst_row.data = dst_original + i1*nb1 + i2*nb2;
|
2076
|
+
|
2077
|
+
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
2078
|
+
}
|
2006
2079
|
}
|
2007
2080
|
} else {
|
2008
2081
|
ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
|
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
2011
2084
|
src1_row.data = src1_contiguous.get();
|
2012
2085
|
dst_row.data = dst_contiguous.get();
|
2013
2086
|
|
2014
|
-
for (
|
2087
|
+
for (int64_t i02 = 0; i02 < n_as; i02++) {
|
2015
2088
|
int64_t num_src1_rows = 0;
|
2016
|
-
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
|
2017
|
-
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
|
2018
2089
|
|
2019
|
-
|
2020
|
-
|
2021
|
-
|
2090
|
+
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
2091
|
+
for (int64_t id = 0; id < n_ids; id++) {
|
2092
|
+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
2022
2093
|
|
2023
|
-
|
2094
|
+
GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
|
2024
2095
|
|
2025
|
-
|
2026
|
-
|
2027
|
-
|
2096
|
+
if (row_id_i != i02) {
|
2097
|
+
continue;
|
2098
|
+
}
|
2099
|
+
|
2100
|
+
num_src1_rows++;
|
2101
|
+
}
|
2028
2102
|
}
|
2029
2103
|
|
2030
2104
|
if (num_src1_rows == 0) {
|
2031
2105
|
continue;
|
2032
2106
|
}
|
2033
2107
|
|
2034
|
-
|
2108
|
+
ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
|
2109
|
+
ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
|
2110
|
+
CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
|
2035
2111
|
|
2036
|
-
|
2037
|
-
|
2112
|
+
{
|
2113
|
+
dim3 block_dims(std::min((unsigned int)ne10, 768u));
|
2114
|
+
dim3 grid_dims(ids->ne[1], n_ids);
|
2115
|
+
k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
2116
|
+
src1_original, src1_contiguous.get(),
|
2117
|
+
dev_cur_src1_row.get(), dev_row_mapping.get(),
|
2118
|
+
ids_dev, i02, ids->nb[1], ids->nb[0],
|
2119
|
+
ne11, ne10,
|
2120
|
+
nb11, nb12);
|
2121
|
+
CUDA_CHECK(cudaGetLastError());
|
2122
|
+
}
|
2123
|
+
|
2124
|
+
src0_row.data = src0_original + i02*nb02;
|
2038
2125
|
|
2126
|
+
GGML_ASSERT(nb11 == sizeof(float)*ne10);
|
2127
|
+
GGML_ASSERT(nb1 == sizeof(float)*ne0);
|
2128
|
+
|
2129
|
+
src1_row.ne[1] = num_src1_rows;
|
2039
2130
|
src1_row.nb[1] = nb11;
|
2040
2131
|
src1_row.nb[2] = num_src1_rows*nb11;
|
2041
2132
|
src1_row.nb[3] = num_src1_rows*nb11;
|
2042
2133
|
|
2134
|
+
dst_row.ne[1] = num_src1_rows;
|
2043
2135
|
dst_row.nb[1] = nb1;
|
2044
2136
|
dst_row.nb[2] = num_src1_rows*nb1;
|
2045
2137
|
dst_row.nb[3] = num_src1_rows*nb1;
|
2046
2138
|
|
2047
2139
|
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
|
2048
2140
|
|
2049
|
-
|
2050
|
-
|
2051
|
-
|
2052
|
-
|
2053
|
-
|
2054
|
-
|
2055
|
-
|
2056
|
-
|
2057
|
-
|
2058
|
-
|
2059
|
-
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
|
2060
|
-
nb1, cudaMemcpyDeviceToDevice, stream));
|
2061
|
-
num_src1_rows++;
|
2141
|
+
{
|
2142
|
+
dim3 block_dims(std::min((unsigned int)ne0, 768u));
|
2143
|
+
dim3 grid_dims(num_src1_rows);
|
2144
|
+
k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
|
2145
|
+
dst_original, dst_contiguous.get(),
|
2146
|
+
dev_row_mapping.get(),
|
2147
|
+
ne0,
|
2148
|
+
nb1, nb2);
|
2149
|
+
CUDA_CHECK(cudaGetLastError());
|
2062
2150
|
}
|
2063
2151
|
}
|
2064
2152
|
}
|
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|
2487
2575
|
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
2488
2576
|
const int min_batch_size = 32;
|
2489
2577
|
|
2490
|
-
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS
|
2578
|
+
return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
|
2579
|
+
(op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
|
2491
2580
|
|
2492
2581
|
GGML_UNUSED(backend);
|
2493
2582
|
}
|