llama_cpp 0.14.5 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +24 -7
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +135 -46
- data/vendor/tmp/llama.cpp/ggml-impl.h +263 -5
- data/vendor/tmp/llama.cpp/ggml-metal.m +130 -83
- data/vendor/tmp/llama.cpp/ggml-metal.metal +505 -1467
- data/vendor/tmp/llama.cpp/ggml-quants.c +1 -294
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +65 -52
- data/vendor/tmp/llama.cpp/ggml.c +151 -99
- data/vendor/tmp/llama.cpp/ggml.h +5 -4
- data/vendor/tmp/llama.cpp/llama.cpp +1308 -254
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +999 -0
- data/vendor/tmp/llama.cpp/sgemm.h +12 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
+  data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
+  data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
+## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+- Bump llama.cpp from b2698 to b2740.
+- Add `keep_split` accessor to `ModelQuantizeParams`.
+- Add `pooling_type` method to `Context`.
+- Add `token_is_eog?` method to `Model`.
+
+Implementation binding for llama_sample_token_with_rng has been skipped.
+
+## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
+
+- Bump llama.cpp from b2658 to b2698.
+
 ## [[0.14.5](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.4...v0.14.5)] - 2024-04-13
 
 - Bump llama.cpp from b2608 to b2658.
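For orientation, here is a minimal Ruby sketch of how the new `Context#pooling_type` and `Model#token_is_eog?` methods might be called. The model file name and the constructor keyword arguments are assumptions for illustration; only the two new method calls come from this release.

```ruby
require 'llama_cpp'

# Assumed setup; adjust the model path and parameters to your environment.
model   = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)
context = LLaMACpp::Context.new(model: model, params: LLaMACpp::ContextParams.new)

# New in 0.14.7: the pooling type configured on the context, as an integer enum value.
puts context.pooling_type

# New in 0.14.7: true for tokens that end generation (end-of-turn and similar).
puts model.token_is_eog?(model.token_eot)
```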
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1321,6 +1321,8 @@ public:
     rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
     rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+    rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
   }
 
 private:
@@ -1405,6 +1407,18 @@
     LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
     return ptr->params.pure ? Qtrue : Qfalse;
   }
+
+  // keep_split
+  static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    ptr->params.keep_split = RTEST(keep_split) ? true : false;
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
+
+  static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+    LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+    return ptr->params.keep_split ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1501,7 @@ public:
     rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
     rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
     rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+    rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
   }
 
 private:
@@ -1634,10 +1649,10 @@
     const llama_token token = NUM2INT(token_);
     LLaMAModelWrapper* ptr = get_llama_model(self);
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+    const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
     if (n_tokens < 0) {
       result.resize(-n_tokens);
-      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+      const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
       if (check != -n_tokens) {
         rb_raise(rb_eRuntimeError, "failed to convert");
         return Qnil;
@@ -1789,6 +1804,16 @@
     LLaMAModelWrapper* ptr = get_llama_model(self);
     return INT2NUM(llama_token_eot(ptr->model));
   }
+
+  static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+    if (!RB_INTEGER_TYPE_P(token_)) {
+      rb_raise(rb_eArgError, "token must be an integer");
+      return Qnil;
+    }
+    const llama_token token = NUM2INT(token_);
+    LLaMAModelWrapper* ptr = get_llama_model(self);
+    return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+  }
 };
 
 const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2127,7 @@ public:
     rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
     rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
     rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+    rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
   }
 
 private:
@@ -3225,6 +3251,15 @@
 
     return Qnil;
   }
+
+  static VALUE _llama_context_pooling_type(VALUE self) {
+    LLaMAContextWrapper* ptr = get_llama_context(self);
+    if (ptr->ctx == NULL) {
+      rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+      return Qnil;
+    }
+    return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+  }
 };
 
 const rb_data_type_t RbLLaMAContext::llama_context_type = {
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.14.5'
+  VERSION = '0.14.7'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b2658'
+  LLAMA_CPP_VERSION = 'b2740'
 end
data/sig/llama_cpp.rbs
CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
     def token_middle: () -> Integer
     def token_suffix: () -> Integer
     def token_eot: () -> Integer
+    def token_is_eog?: (Integer) -> bool
   end
 
   class Timings
@@ -260,6 +261,7 @@ module LLaMACpp
     def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
     def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
     def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+    def pooling_type: () -> Integer
   end
 
   class ContextParams
@@ -328,6 +330,8 @@ module LLaMACpp
     def only_copy=: (bool) -> bool
     def pure: () -> bool
     def pure=: (bool) -> bool
+    def keep_split: () -> bool
+    def keep_split=: (bool) -> bool
   end
 
   class Params = ContextParams
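The `keep_split` signatures above map to the `keep_split` field of llama.cpp's quantization parameters (quantize a split GGUF into the same number of shards). A hedged Ruby sketch of how it might be used; the `LLaMACpp.model_quantize` call, its keyword names, and the file names are assumptions for illustration, only the `keep_split` accessor itself comes from this release.

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.keep_split = true   # new in 0.14.7: keep the split-file layout of the input model
puts params.keep_split     # => true

# Assumed module-level entry point; adjust to the gem's actual quantization API.
LLaMACpp.model_quantize(input_path: 'model-00001-of-00002.gguf',
                        output_path: 'model-q4_0.gguf',
                        params: params)
```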
data/vendor/tmp/llama.cpp/Makefile
CHANGED
@@ -386,6 +386,11 @@ ifdef LLAMA_OPENBLAS
     MK_LDFLAGS += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
+ifndef LLAMA_NO_LLAMAFILE
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+    OBJS += sgemm.o
+endif
+
 ifdef LLAMA_BLIS
     MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
     MK_LDFLAGS += -lblis -L/usr/local/lib
@@ -482,11 +487,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
     $(NVCC_COMPILE)
-
 endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
-
     MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
     MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
     MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -605,6 +608,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
     $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 GF_CC := $(CC)
 include scripts/get-flags.mk
 
@@ -689,8 +697,8 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
 llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
 
 common.o: common/common.cpp $(COMMON_H_DEPS)
     $(CXX) $(CXXFLAGS) -c $< -o $@
@@ -724,7 +732,7 @@ lib: llama.o ggml.o $(OBJS)
     ar rcs libllama.a $^
 
 clean:
-    rm -vrf *.o tests/*.o *.so *.a *.dll
+    rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
     rm -vrf ggml-cuda/*.o
 
 #
@@ -761,7 +769,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
@@ -793,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
 
+# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+examples/server/%.hpp: examples/server/public/% Makefile
+    @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+        echo "unsigned char $${NAME}[] = {" && \
+        cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+        echo "};" && \
+        echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+    ) > $@
+
 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
data/vendor/tmp/llama.cpp/ggml-alloc.c
CHANGED
@@ -371,16 +371,16 @@ struct ggml_gallocr {
 };
 
 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
-    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr)
+    ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
     GGML_ASSERT(galloc != NULL);
 
-    galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t)
+    galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);
 
-    galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs
+    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
     GGML_ASSERT(galloc->buffers != NULL);
 
-    galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *)
+    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
     GGML_ASSERT(galloc->buf_tallocs != NULL);
 
     for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         free(galloc->hash_set.keys);
         free(galloc->hash_values);
         galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *)
-        galloc->hash_values = calloc(sizeof(struct hash_node)
+        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+        galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
         GGML_ASSERT(galloc->hash_set.keys != NULL);
         GGML_ASSERT(galloc->hash_values != NULL);
     } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     // set the node_allocs from the hash table
     if (galloc->n_nodes < graph->n_nodes) {
         free(galloc->node_allocs);
-        galloc->node_allocs = calloc(sizeof(struct node_alloc)
+        galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
         GGML_ASSERT(galloc->node_allocs != NULL);
     }
     galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
     }
     if (galloc->n_leafs < graph->n_leafs) {
         free(galloc->leaf_allocs);
-        galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0])
+        galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
         GGML_ASSERT(galloc->leaf_allocs != NULL);
     }
     galloc->n_leafs = graph->n_leafs;
data/vendor/tmp/llama.cpp/ggml-backend.c
CHANGED
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_CPY:
-            return
+            return
+                op->type != GGML_TYPE_IQ2_XXS &&
+                op->type != GGML_TYPE_IQ2_XS &&
+                op->type != GGML_TYPE_IQ1_S &&
+                op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
 
-    struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched)
+    struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
 
     // initialize hash table
     sched->hash_set = ggml_hash_set_new(graph_size);
-    sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0])
-    sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0])
+    sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+    sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
 
     const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0])
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0])
+    sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+    sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
 
     sched->n_backends = n_backends;
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
     const int initial_splits_capacity = 16;
-    sched->splits = calloc(sizeof(sched->splits[0])
+    sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
     sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
 struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
     struct ggml_hash_set hash_set = {
         /* .size = */ graph->visited_hash_table.size,
-        /* .keys = */ calloc(sizeof(hash_set.keys[0])
+        /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
     };
-    struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
-    bool * node_init = calloc(sizeof(node_init[0])
+    struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+    bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
 
     struct ggml_init_params params = {
         /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
data/vendor/tmp/llama.cpp/ggml-cuda.cu
CHANGED
@@ -1231,7 +1231,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool());
+        ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1241,7 +1241,7 @@ static void ggml_cuda_op_mul_mat_cublas(
         }
         const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get();
 
-        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool());
+        ggml_cuda_pool_alloc<half> src1_as_f16(ctx.pool(id));
         if (src1->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
             GGML_ASSERT(to_fp16_cuda != nullptr);
@@ -1250,7 +1250,7 @@
             to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
         }
         const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(), row_diff*src1_ncols);
+        ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
 
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
@@ -1946,7 +1946,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
         ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
-    } else if (!split &&
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
@@ -1960,20 +1960,73 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     }
 }
 
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous,
+                                                 int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping,
+                                                 const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+                                                 int64_t ne11, int64_t ne10,
+                                                 size_t nb11, size_t nb12) {
+    int32_t iid1 = blockIdx.x;
+    int32_t id = blockIdx.y;
+
+    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
+
+    if (row_id_i != i02) {
+        return;
+    }
+
+    const int64_t i11 = id % ne11;
+    const int64_t i12 = iid1;
+
+    __shared__ int src1_row;
+    if (threadIdx.x == 0) {
+        src1_row = atomicAdd(cur_src1_row, 1);
+        row_mapping[src1_row] = {id, iid1};
+    }
+    __syncthreads();
+
+    const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
+    float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
+
+    for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
+        src1_row_contiguous[i] = src1_row_original[i];
+    }
+}
+
+static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous,
+                                                  const mmid_row_mapping * __restrict__ row_mapping,
+                                                  int64_t ne0,
+                                                  size_t nb1, size_t nb2) {
+    int32_t i = blockIdx.x;
+
+    const int32_t i1 = row_mapping[i].i1;
+    const int32_t i2 = row_mapping[i].i2;
+
+    const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1);
+    float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2);
+
+    for (int j = threadIdx.x; j < ne0; j += blockDim.x) {
+        dst_row_original[j] = dst_row_contiguous[j];
+    }
+}
+
 static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const ggml_tensor * src1 = dst->src[1];
     const ggml_tensor * ids = dst->src[2];
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
 
-    const
-    const
-
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = src0->ne[2];
+    const int64_t n_as = ne02;
+    const int64_t n_ids = ids->ne[0];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
     const char * ids_dev = (const char *) ids->data;
@@ -1982,7 +2035,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row
+    ggml_tensor dst_row = *dst;
 
     char * src0_original = (char *) src0->data;
     char * src1_original = (char *) src1->data;
@@ -1990,19 +2043,39 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     src0_row.ne[2] = 1;
     src0_row.ne[3] = 1;
-    src0_row.nb[3] =
+    src0_row.nb[3] = nb02;
 
-
-
-
+    src1_row.ne[1] = 1;
+    src1_row.ne[2] = 1;
+    src1_row.ne[3] = 1;
+    src1_row.nb[2] = nb11;
+    src1_row.nb[3] = nb11;
 
-
+    dst_row.ne[1] = 1;
+    dst_row.ne[2] = 1;
+    dst_row.ne[3] = 1;
+    dst_row.nb[2] = nb1;
+    dst_row.nb[3] = nb1;
 
-
-
-
+    if (ne12 == 1) {
+        for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+            for (int64_t id = 0; id < n_ids; id++) {
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
+
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            }
         }
     } else {
        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
@@ -2011,54 +2084,69 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         src1_row.data = src1_contiguous.get();
         dst_row.data = dst_contiguous.get();
 
-        for (
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
             int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
-
-
-
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
-
-
-
+                    if (row_id_i != i02) {
+                        continue;
+                    }
+
+                    num_src1_rows++;
+                }
             }
 
             if (num_src1_rows == 0) {
                 continue;
             }
 
-
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
 
-
-
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        src1_original, src1_contiguous.get(),
+                        dev_cur_src1_row.get(), dev_row_mapping.get(),
+                        ids_dev, i02, ids->nb[1], ids->nb[0],
+                        ne11, ne10,
+                        nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
 
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+            src1_row.ne[1] = num_src1_rows;
             src1_row.nb[1] = nb11;
             src1_row.nb[2] = num_src1_rows*nb11;
             src1_row.nb[3] = num_src1_rows*nb11;
 
+            dst_row.ne[1] = num_src1_rows;
             dst_row.nb[1] = nb1;
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
             ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
 
-
-
-
-
-
-
-
-
-
-
-            CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-                        nb1, cudaMemcpyDeviceToDevice, stream));
-            num_src1_rows++;
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                        dst_original, dst_contiguous.get(),
+                        dev_row_mapping.get(),
+                        ne0,
+                        nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
            }
        }
    }
@@ -2487,7 +2575,8 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
     const int min_batch_size = 32;
 
-    return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS
+    return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
+           (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
 
     GGML_UNUSED(backend);
 }