llama_cpp 0.14.6 → 0.14.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
|
4
|
+
data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
|
7
|
+
data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2698 to b2740.
|
4
|
+
- Add `keep_split` accessor to `ModelQuantizeParams`.
|
5
|
+
- Add `pooling_type` method to `Context`.
|
6
|
+
- Add `token_is_eog?` method to `Model`.
|
7
|
+
|
8
|
+
Implementation binding for llama_sample_token_with_rng has been skipped.
|
9
|
+
|
1
10
|
## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
|
2
11
|
|
3
12
|
- Bump llama.cpp from b2658 to b2698.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1321,6 +1321,8 @@ public:
|
|
1321
1321
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
|
1322
1322
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
|
1323
1323
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
|
1324
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
|
1325
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
|
1324
1326
|
}
|
1325
1327
|
|
1326
1328
|
private:
|
@@ -1405,6 +1407,18 @@ private:
|
|
1405
1407
|
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1406
1408
|
return ptr->params.pure ? Qtrue : Qfalse;
|
1407
1409
|
}
|
1410
|
+
|
1411
|
+
// keep_split
|
1412
|
+
static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
|
1413
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1414
|
+
ptr->params.keep_split = RTEST(keep_split) ? true : false;
|
1415
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1416
|
+
}
|
1417
|
+
|
1418
|
+
static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
|
1419
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1420
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1421
|
+
}
|
1408
1422
|
};
|
1409
1423
|
|
1410
1424
|
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
@@ -1487,6 +1501,7 @@ public:
|
|
1487
1501
|
rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
|
1488
1502
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1489
1503
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1504
|
+
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1490
1505
|
}
|
1491
1506
|
|
1492
1507
|
private:
|
@@ -1634,10 +1649,10 @@ private:
|
|
1634
1649
|
const llama_token token = NUM2INT(token_);
|
1635
1650
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1636
1651
|
std::vector<char> result(8, 0);
|
1637
|
-
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1652
|
+
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1638
1653
|
if (n_tokens < 0) {
|
1639
1654
|
result.resize(-n_tokens);
|
1640
|
-
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1655
|
+
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1641
1656
|
if (check != -n_tokens) {
|
1642
1657
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
1643
1658
|
return Qnil;
|
@@ -1789,6 +1804,16 @@ private:
|
|
1789
1804
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1790
1805
|
return INT2NUM(llama_token_eot(ptr->model));
|
1791
1806
|
}
|
1807
|
+
|
1808
|
+
static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
|
1809
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1810
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1811
|
+
return Qnil;
|
1812
|
+
}
|
1813
|
+
const llama_token token = NUM2INT(token_);
|
1814
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1815
|
+
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1816
|
+
}
|
1792
1817
|
};
|
1793
1818
|
|
1794
1819
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -2102,6 +2127,7 @@ public:
|
|
2102
2127
|
rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
|
2103
2128
|
rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
|
2104
2129
|
rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
|
2130
|
+
rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
|
2105
2131
|
}
|
2106
2132
|
|
2107
2133
|
private:
|
@@ -3225,6 +3251,15 @@ private:
|
|
3225
3251
|
|
3226
3252
|
return Qnil;
|
3227
3253
|
}
|
3254
|
+
|
3255
|
+
static VALUE _llama_context_pooling_type(VALUE self) {
|
3256
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
3257
|
+
if (ptr->ctx == NULL) {
|
3258
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
3259
|
+
return Qnil;
|
3260
|
+
}
|
3261
|
+
return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
|
3262
|
+
}
|
3228
3263
|
};
|
3229
3264
|
|
3230
3265
|
const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for the llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.6'
|
6
|
+
VERSION = '0.14.7'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2698'
|
9
|
+
LLAMA_CPP_VERSION = 'b2740'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
|
|
141
141
|
def token_middle: () -> Integer
|
142
142
|
def token_suffix: () -> Integer
|
143
143
|
def token_eot: () -> Integer
|
144
|
+
def token_is_eog?: (Integer) -> bool
|
144
145
|
end
|
145
146
|
|
146
147
|
class Timings
|
@@ -260,6 +261,7 @@ module LLaMACpp
|
|
260
261
|
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
261
262
|
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
262
263
|
def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
|
264
|
+
def pooling_type: () -> Integer
|
263
265
|
end
|
264
266
|
|
265
267
|
class ContextParams
|
@@ -328,6 +330,8 @@ module LLaMACpp
|
|
328
330
|
def only_copy=: (bool) -> bool
|
329
331
|
def pure: () -> bool
|
330
332
|
def pure=: (bool) -> bool
|
333
|
+
def keep_split: () -> bool
|
334
|
+
def keep_split=: (bool) -> bool
|
331
335
|
end
|
332
336
|
|
333
337
|
class Params = ContextParams
|
@@ -386,10 +386,6 @@ ifdef LLAMA_OPENBLAS
|
|
386
386
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
387
387
|
endif # LLAMA_OPENBLAS
|
388
388
|
|
389
|
-
# TODO: temporary disable until MoE is fixed
|
390
|
-
# https://github.com/ggerganov/llama.cpp/pull/6716
|
391
|
-
LLAMA_NO_LLAMAFILE := 1
|
392
|
-
|
393
389
|
ifndef LLAMA_NO_LLAMAFILE
|
394
390
|
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
395
391
|
OBJS += sgemm.o
|
@@ -701,7 +697,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
|
701
697
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
702
698
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
703
699
|
|
704
|
-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
700
|
+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
705
701
|
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
706
702
|
|
707
703
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
@@ -805,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
805
801
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
806
802
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
807
803
|
|
808
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
804
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
809
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
810
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
811
807
|
|
808
|
+
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
809
|
+
examples/server/%.hpp: examples/server/public/% Makefile
|
810
|
+
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
|
811
|
+
echo "unsigned char $${NAME}[] = {" && \
|
812
|
+
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
|
813
|
+
echo "};" && \
|
814
|
+
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
815
|
+
) > $@
|
816
|
+
|
812
817
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
813
818
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
814
819
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -371,16 +371,16 @@ struct ggml_gallocr {
|
|
371
371
|
};
|
372
372
|
|
373
373
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
374
|
-
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
|
374
|
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
375
375
|
GGML_ASSERT(galloc != NULL);
|
376
376
|
|
377
|
-
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
|
377
|
+
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
378
378
|
GGML_ASSERT(galloc->bufts != NULL);
|
379
379
|
|
380
|
-
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
|
380
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
381
381
|
GGML_ASSERT(galloc->buffers != NULL);
|
382
382
|
|
383
|
-
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
|
383
|
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
384
384
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
385
385
|
|
386
386
|
for (int i = 0; i < n_bufs; i++) {
|
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
646
646
|
free(galloc->hash_set.keys);
|
647
647
|
free(galloc->hash_values);
|
648
648
|
galloc->hash_set.size = hash_size;
|
649
|
-
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
|
650
|
-
galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
|
649
|
+
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
650
|
+
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
651
651
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
652
652
|
GGML_ASSERT(galloc->hash_values != NULL);
|
653
653
|
} else {
|
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
667
667
|
// set the node_allocs from the hash table
|
668
668
|
if (galloc->n_nodes < graph->n_nodes) {
|
669
669
|
free(galloc->node_allocs);
|
670
|
-
galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
|
670
|
+
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
671
671
|
GGML_ASSERT(galloc->node_allocs != NULL);
|
672
672
|
}
|
673
673
|
galloc->n_nodes = graph->n_nodes;
|
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
697
697
|
}
|
698
698
|
if (galloc->n_leafs < graph->n_leafs) {
|
699
699
|
free(galloc->leaf_allocs);
|
700
|
-
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
|
700
|
+
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
701
701
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
702
702
|
}
|
703
703
|
galloc->n_leafs = graph->n_leafs;
|
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
822
822
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
823
823
|
switch (op->op) {
|
824
824
|
case GGML_OP_CPY:
|
825
|
-
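return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float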
|
825
|
+
return
|
826
|
+
op->type != GGML_TYPE_IQ2_XXS &&
|
827
|
+
op->type != GGML_TYPE_IQ2_XS &&
|
828
|
+
op->type != GGML_TYPE_IQ1_S &&
|
829
|
+
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
826
830
|
case GGML_OP_MUL_MAT:
|
827
831
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
828
832
|
default:
|
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
1721
1725
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
1722
1726
|
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1723
1727
|
|
1724
|
-
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
1728
|
+
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
1725
1729
|
|
1726
1730
|
// initialize hash table
|
1727
1731
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
1728
|
-
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
|
1729
|
-
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
|
1732
|
+
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
1733
|
+
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
1730
1734
|
|
1731
1735
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1732
|
-
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
|
1733
|
-
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
|
1736
|
+
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1737
|
+
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1734
1738
|
|
1735
1739
|
sched->n_backends = n_backends;
|
1736
1740
|
|
1737
1741
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
1738
1742
|
|
1739
1743
|
const int initial_splits_capacity = 16;
|
1740
|
-
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
|
1744
|
+
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1741
1745
|
sched->splits_capacity = initial_splits_capacity;
|
1742
1746
|
|
1743
1747
|
for (int b = 0; b < n_backends; b++) {
|
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
1968
1972
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1969
1973
|
struct ggml_hash_set hash_set = {
|
1970
1974
|
/* .size = */ graph->visited_hash_table.size,
|
1971
|
-
/* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
|
1975
|
+
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
1972
1976
|
};
|
1973
|
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
|
1974
|
-
bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
|
1977
|
+
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1978
|
+
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
1975
1979
|
|
1976
1980
|
struct ggml_init_params params = {
|
1977
1981
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
@@ -11,6 +11,12 @@
|
|
11
11
|
#include <string.h> // memcpy
|
12
12
|
#include <math.h> // fabsf
|
13
13
|
|
14
|
+
#undef MIN
|
15
|
+
#undef MAX
|
16
|
+
|
17
|
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
18
|
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
19
|
+
|
14
20
|
#ifdef __cplusplus
|
15
21
|
extern "C" {
|
16
22
|
#endif
|
@@ -45,7 +51,7 @@ extern "C" {
|
|
45
51
|
// 16-bit float
|
46
52
|
// on Arm, we use __fp16
|
47
53
|
// on x86, we use uint16_t
|
48
|
-
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
54
|
+
#if defined(__ARM_NEON)
|
49
55
|
|
50
56
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
51
57
|
//
|
@@ -53,8 +59,262 @@ extern "C" {
|
|
53
59
|
//
|
54
60
|
#include <arm_neon.h>
|
55
61
|
|
62
|
+
#ifdef _MSC_VER
|
63
|
+
|
64
|
+
typedef uint16_t ggml_fp16_internal_t;
|
65
|
+
|
66
|
+
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
67
|
+
|
68
|
+
#else
|
69
|
+
|
56
70
|
typedef __fp16 ggml_fp16_internal_t;
|
57
71
|
|
72
|
+
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
73
|
+
|
74
|
+
#endif // _MSC_VER
|
75
|
+
|
76
|
+
#if !defined(__aarch64__)
|
77
|
+
|
78
|
+
// 32-bit ARM compatibility
|
79
|
+
|
80
|
+
// vaddvq_s16
|
81
|
+
// vpaddq_s16
|
82
|
+
// vpaddq_s32
|
83
|
+
// vaddvq_s32
|
84
|
+
// vaddvq_f32
|
85
|
+
// vmaxvq_f32
|
86
|
+
// vcvtnq_s32_f32
|
87
|
+
// vzip1_u8
|
88
|
+
// vzip2_u8
|
89
|
+
|
90
|
+
inline static int32_t vaddvq_s16(int16x8_t v) {
|
91
|
+
return
|
92
|
+
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
93
|
+
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
94
|
+
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
95
|
+
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
96
|
+
}
|
97
|
+
|
98
|
+
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
99
|
+
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
100
|
+
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
101
|
+
return vcombine_s16(a0, b0);
|
102
|
+
}
|
103
|
+
|
104
|
+
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
105
|
+
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
106
|
+
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
107
|
+
return vcombine_s32(a0, b0);
|
108
|
+
}
|
109
|
+
|
110
|
+
inline static int32_t vaddvq_s32(int32x4_t v) {
|
111
|
+
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
112
|
+
}
|
113
|
+
|
114
|
+
inline static float vaddvq_f32(float32x4_t v) {
|
115
|
+
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
116
|
+
}
|
117
|
+
|
118
|
+
inline static float vmaxvq_f32(float32x4_t v) {
|
119
|
+
return
|
120
|
+
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
121
|
+
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
122
|
+
}
|
123
|
+
|
124
|
+
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
125
|
+
int32x4_t res;
|
126
|
+
|
127
|
+
res[0] = roundf(vgetq_lane_f32(v, 0));
|
128
|
+
res[1] = roundf(vgetq_lane_f32(v, 1));
|
129
|
+
res[2] = roundf(vgetq_lane_f32(v, 2));
|
130
|
+
res[3] = roundf(vgetq_lane_f32(v, 3));
|
131
|
+
|
132
|
+
return res;
|
133
|
+
}
|
134
|
+
|
135
|
+
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
136
|
+
uint8x8_t res;
|
137
|
+
|
138
|
+
res[0] = a[0]; res[1] = b[0];
|
139
|
+
res[2] = a[1]; res[3] = b[1];
|
140
|
+
res[4] = a[2]; res[5] = b[2];
|
141
|
+
res[6] = a[3]; res[7] = b[3];
|
142
|
+
|
143
|
+
return res;
|
144
|
+
}
|
145
|
+
|
146
|
+
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
147
|
+
uint8x8_t res;
|
148
|
+
|
149
|
+
res[0] = a[4]; res[1] = b[4];
|
150
|
+
res[2] = a[5]; res[3] = b[5];
|
151
|
+
res[4] = a[6]; res[5] = b[6];
|
152
|
+
res[6] = a[7]; res[7] = b[7];
|
153
|
+
|
154
|
+
return res;
|
155
|
+
}
|
156
|
+
|
157
|
+
// vld1q_s16_x2
|
158
|
+
// vld1q_u8_x2
|
159
|
+
// vld1q_u8_x4
|
160
|
+
// vld1q_s8_x2
|
161
|
+
// vld1q_s8_x4
|
162
|
+
// TODO: double-check these work correctly
|
163
|
+
|
164
|
+
typedef struct ggml_int16x8x2_t {
|
165
|
+
int16x8_t val[2];
|
166
|
+
} ggml_int16x8x2_t;
|
167
|
+
|
168
|
+
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
169
|
+
ggml_int16x8x2_t res;
|
170
|
+
|
171
|
+
res.val[0] = vld1q_s16(ptr + 0);
|
172
|
+
res.val[1] = vld1q_s16(ptr + 8);
|
173
|
+
|
174
|
+
return res;
|
175
|
+
}
|
176
|
+
|
177
|
+
typedef struct ggml_uint8x16x2_t {
|
178
|
+
uint8x16_t val[2];
|
179
|
+
} ggml_uint8x16x2_t;
|
180
|
+
|
181
|
+
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
182
|
+
ggml_uint8x16x2_t res;
|
183
|
+
|
184
|
+
res.val[0] = vld1q_u8(ptr + 0);
|
185
|
+
res.val[1] = vld1q_u8(ptr + 16);
|
186
|
+
|
187
|
+
return res;
|
188
|
+
}
|
189
|
+
|
190
|
+
typedef struct ggml_uint8x16x4_t {
|
191
|
+
uint8x16_t val[4];
|
192
|
+
} ggml_uint8x16x4_t;
|
193
|
+
|
194
|
+
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
195
|
+
ggml_uint8x16x4_t res;
|
196
|
+
|
197
|
+
res.val[0] = vld1q_u8(ptr + 0);
|
198
|
+
res.val[1] = vld1q_u8(ptr + 16);
|
199
|
+
res.val[2] = vld1q_u8(ptr + 32);
|
200
|
+
res.val[3] = vld1q_u8(ptr + 48);
|
201
|
+
|
202
|
+
return res;
|
203
|
+
}
|
204
|
+
|
205
|
+
typedef struct ggml_int8x16x2_t {
|
206
|
+
int8x16_t val[2];
|
207
|
+
} ggml_int8x16x2_t;
|
208
|
+
|
209
|
+
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
210
|
+
ggml_int8x16x2_t res;
|
211
|
+
|
212
|
+
res.val[0] = vld1q_s8(ptr + 0);
|
213
|
+
res.val[1] = vld1q_s8(ptr + 16);
|
214
|
+
|
215
|
+
return res;
|
216
|
+
}
|
217
|
+
|
218
|
+
typedef struct ggml_int8x16x4_t {
|
219
|
+
int8x16_t val[4];
|
220
|
+
} ggml_int8x16x4_t;
|
221
|
+
|
222
|
+
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
223
|
+
ggml_int8x16x4_t res;
|
224
|
+
|
225
|
+
res.val[0] = vld1q_s8(ptr + 0);
|
226
|
+
res.val[1] = vld1q_s8(ptr + 16);
|
227
|
+
res.val[2] = vld1q_s8(ptr + 32);
|
228
|
+
res.val[3] = vld1q_s8(ptr + 48);
|
229
|
+
|
230
|
+
return res;
|
231
|
+
}
|
232
|
+
|
233
|
+
// NOTE: not tested
|
234
|
+
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
235
|
+
int8x16_t res;
|
236
|
+
|
237
|
+
res[ 0] = a[b[ 0]];
|
238
|
+
res[ 1] = a[b[ 1]];
|
239
|
+
res[ 2] = a[b[ 2]];
|
240
|
+
res[ 3] = a[b[ 3]];
|
241
|
+
res[ 4] = a[b[ 4]];
|
242
|
+
res[ 5] = a[b[ 5]];
|
243
|
+
res[ 6] = a[b[ 6]];
|
244
|
+
res[ 7] = a[b[ 7]];
|
245
|
+
res[ 8] = a[b[ 8]];
|
246
|
+
res[ 9] = a[b[ 9]];
|
247
|
+
res[10] = a[b[10]];
|
248
|
+
res[11] = a[b[11]];
|
249
|
+
res[12] = a[b[12]];
|
250
|
+
res[13] = a[b[13]];
|
251
|
+
res[14] = a[b[14]];
|
252
|
+
res[15] = a[b[15]];
|
253
|
+
|
254
|
+
return res;
|
255
|
+
}
|
256
|
+
|
257
|
+
// NOTE: not tested
|
258
|
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
259
|
+
uint8x16_t res;
|
260
|
+
|
261
|
+
res[ 0] = a[b[ 0]];
|
262
|
+
res[ 1] = a[b[ 1]];
|
263
|
+
res[ 2] = a[b[ 2]];
|
264
|
+
res[ 3] = a[b[ 3]];
|
265
|
+
res[ 4] = a[b[ 4]];
|
266
|
+
res[ 5] = a[b[ 5]];
|
267
|
+
res[ 6] = a[b[ 6]];
|
268
|
+
res[ 7] = a[b[ 7]];
|
269
|
+
res[ 8] = a[b[ 8]];
|
270
|
+
res[ 9] = a[b[ 9]];
|
271
|
+
res[10] = a[b[10]];
|
272
|
+
res[11] = a[b[11]];
|
273
|
+
res[12] = a[b[12]];
|
274
|
+
res[13] = a[b[13]];
|
275
|
+
res[14] = a[b[14]];
|
276
|
+
res[15] = a[b[15]];
|
277
|
+
|
278
|
+
return res;
|
279
|
+
}
|
280
|
+
|
281
|
+
#else
|
282
|
+
|
283
|
+
#define ggml_int16x8x2_t int16x8x2_t
|
284
|
+
#define ggml_uint8x16x2_t uint8x16x2_t
|
285
|
+
#define ggml_uint8x16x4_t uint8x16x4_t
|
286
|
+
#define ggml_int8x16x2_t int8x16x2_t
|
287
|
+
#define ggml_int8x16x4_t int8x16x4_t
|
288
|
+
|
289
|
+
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
290
|
+
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
291
|
+
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
292
|
+
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
293
|
+
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
294
|
+
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
295
|
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
296
|
+
|
297
|
+
#endif // !defined(__aarch64__)
|
298
|
+
|
299
|
+
#if !defined(__ARM_FEATURE_DOTPROD)
|
300
|
+
|
301
|
+
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
302
|
+
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
303
|
+
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
304
|
+
|
305
|
+
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
306
|
+
}
|
307
|
+
|
308
|
+
#else
|
309
|
+
|
310
|
+
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
311
|
+
|
312
|
+
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
313
|
+
|
314
|
+
#endif // defined(__ARM_NEON)
|
315
|
+
|
316
|
+
#if defined(__ARM_NEON) && !defined(__MSC_VER)
|
317
|
+
|
58
318
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
59
319
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
60
320
|
|
@@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
75
335
|
|
76
336
|
#else
|
77
337
|
|
78
|
-
typedef uint16_t ggml_fp16_internal_t;
|
79
|
-
|
80
338
|
#ifdef __wasm_simd128__
|
81
339
|
#include <wasm_simd128.h>
|
82
340
|
#else
|
@@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
221
479
|
|
222
480
|
#endif // __F16C__
|
223
481
|
|
224
|
-
#endif // __ARM_NEON
|
482
|
+
#endif // defined(__ARM_NEON) && !defined(__MSC_VER)
|
225
483
|
|
226
484
|
// precomputed f32 table for f16 (256 KB)
|
227
485
|
// defined in ggml.c, initialized in ggml_init()
|