llama_cpp 0.14.6 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
|
4
|
+
data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
|
7
|
+
data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
|
2
|
+
|
3
|
+
- Bump llama.cpp from b2698 to b2740.
|
4
|
+
- Add `keep_split` accessor to `ModelQuantizeParams`.
|
5
|
+
- Add `pooling_type` method to `Context`.
|
6
|
+
- Add `token_is_eog?` method to `Model`.
|
7
|
+
|
8
|
+
Implementation of the binding for `llama_sample_token_with_rng` has been skipped.
|
9
|
+
|
1
10
|
## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
|
2
11
|
|
3
12
|
- Bump llama.cpp from b2658 to b2698.
|
data/ext/llama_cpp/llama_cpp.cpp
CHANGED
@@ -1321,6 +1321,8 @@ public:
|
|
1321
1321
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
|
1322
1322
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
|
1323
1323
|
rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
|
1324
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
|
1325
|
+
rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
|
1324
1326
|
}
|
1325
1327
|
|
1326
1328
|
private:
|
@@ -1405,6 +1407,18 @@ private:
|
|
1405
1407
|
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1406
1408
|
return ptr->params.pure ? Qtrue : Qfalse;
|
1407
1409
|
}
|
1410
|
+
|
1411
|
+
// keep_split
|
1412
|
+
static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
|
1413
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1414
|
+
ptr->params.keep_split = RTEST(keep_split) ? true : false;
|
1415
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1416
|
+
}
|
1417
|
+
|
1418
|
+
static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
|
1419
|
+
LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
|
1420
|
+
return ptr->params.keep_split ? Qtrue : Qfalse;
|
1421
|
+
}
|
1408
1422
|
};
|
1409
1423
|
|
1410
1424
|
const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
|
@@ -1487,6 +1501,7 @@ public:
|
|
1487
1501
|
rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
|
1488
1502
|
rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
|
1489
1503
|
rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
|
1504
|
+
rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
|
1490
1505
|
}
|
1491
1506
|
|
1492
1507
|
private:
|
@@ -1634,10 +1649,10 @@ private:
|
|
1634
1649
|
const llama_token token = NUM2INT(token_);
|
1635
1650
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1636
1651
|
std::vector<char> result(8, 0);
|
1637
|
-
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1652
|
+
const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1638
1653
|
if (n_tokens < 0) {
|
1639
1654
|
result.resize(-n_tokens);
|
1640
|
-
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
|
1655
|
+
const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
|
1641
1656
|
if (check != -n_tokens) {
|
1642
1657
|
rb_raise(rb_eRuntimeError, "failed to convert");
|
1643
1658
|
return Qnil;
|
@@ -1789,6 +1804,16 @@ private:
|
|
1789
1804
|
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1790
1805
|
return INT2NUM(llama_token_eot(ptr->model));
|
1791
1806
|
}
|
1807
|
+
|
1808
|
+
static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
|
1809
|
+
if (!RB_INTEGER_TYPE_P(token_)) {
|
1810
|
+
rb_raise(rb_eArgError, "token must be an integer");
|
1811
|
+
return Qnil;
|
1812
|
+
}
|
1813
|
+
const llama_token token = NUM2INT(token_);
|
1814
|
+
LLaMAModelWrapper* ptr = get_llama_model(self);
|
1815
|
+
return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
|
1816
|
+
}
|
1792
1817
|
};
|
1793
1818
|
|
1794
1819
|
const rb_data_type_t RbLLaMAModel::llama_model_type = {
|
@@ -2102,6 +2127,7 @@ public:
|
|
2102
2127
|
rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
|
2103
2128
|
rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
|
2104
2129
|
rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
|
2130
|
+
rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
|
2105
2131
|
}
|
2106
2132
|
|
2107
2133
|
private:
|
@@ -3225,6 +3251,15 @@ private:
|
|
3225
3251
|
|
3226
3252
|
return Qnil;
|
3227
3253
|
}
|
3254
|
+
|
3255
|
+
static VALUE _llama_context_pooling_type(VALUE self) {
|
3256
|
+
LLaMAContextWrapper* ptr = get_llama_context(self);
|
3257
|
+
if (ptr->ctx == NULL) {
|
3258
|
+
rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
|
3259
|
+
return Qnil;
|
3260
|
+
}
|
3261
|
+
return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
|
3262
|
+
}
|
3228
3263
|
};
|
3229
3264
|
|
3230
3265
|
const rb_data_type_t RbLLaMAContext::llama_context_type = {
|
data/lib/llama_cpp/version.rb
CHANGED
@@ -3,8 +3,8 @@
|
|
3
3
|
# llama_cpp.rb provides Ruby bindings for llama.cpp.
|
4
4
|
module LLaMACpp
|
5
5
|
# The version of llama_cpp.rb you install.
|
6
|
-
VERSION = '0.14.6'
|
6
|
+
VERSION = '0.14.7'
|
7
7
|
|
8
8
|
# The version of llama.cpp bundled with llama_cpp.rb.
|
9
|
-
LLAMA_CPP_VERSION = 'b2698'
|
9
|
+
LLAMA_CPP_VERSION = 'b2740'
|
10
10
|
end
|
data/sig/llama_cpp.rbs
CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
|
|
141
141
|
def token_middle: () -> Integer
|
142
142
|
def token_suffix: () -> Integer
|
143
143
|
def token_eot: () -> Integer
|
144
|
+
def token_is_eog?: (Integer) -> bool
|
144
145
|
end
|
145
146
|
|
146
147
|
class Timings
|
@@ -260,6 +261,7 @@ module LLaMACpp
|
|
260
261
|
def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
|
261
262
|
def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
|
262
263
|
def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
|
264
|
+
def pooling_type: () -> Integer
|
263
265
|
end
|
264
266
|
|
265
267
|
class ContextParams
|
@@ -328,6 +330,8 @@ module LLaMACpp
|
|
328
330
|
def only_copy=: (bool) -> bool
|
329
331
|
def pure: () -> bool
|
330
332
|
def pure=: (bool) -> bool
|
333
|
+
def keep_split: () -> bool
|
334
|
+
def keep_split=: (bool) -> bool
|
331
335
|
end
|
332
336
|
|
333
337
|
class Params = ContextParams
|
@@ -386,10 +386,6 @@ ifdef LLAMA_OPENBLAS
|
|
386
386
|
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
387
387
|
endif # LLAMA_OPENBLAS
|
388
388
|
|
389
|
-
# TODO: temporary disable until MoE is fixed
|
390
|
-
# https://github.com/ggerganov/llama.cpp/pull/6716
|
391
|
-
LLAMA_NO_LLAMAFILE := 1
|
392
|
-
|
393
389
|
ifndef LLAMA_NO_LLAMAFILE
|
394
390
|
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
|
395
391
|
OBJS += sgemm.o
|
@@ -701,7 +697,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
|
|
701
697
|
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
|
702
698
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
703
699
|
|
704
|
-
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
|
700
|
+
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
|
705
701
|
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
|
706
702
|
|
707
703
|
common.o: common/common.cpp $(COMMON_H_DEPS)
|
@@ -805,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
|
|
805
801
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
806
802
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
807
803
|
|
808
|
-
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
804
|
+
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
809
805
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
810
806
|
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
|
811
807
|
|
808
|
+
# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
|
809
|
+
examples/server/%.hpp: examples/server/public/% Makefile
|
810
|
+
@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
|
811
|
+
echo "unsigned char $${NAME}[] = {" && \
|
812
|
+
cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
|
813
|
+
echo "};" && \
|
814
|
+
echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
|
815
|
+
) > $@
|
816
|
+
|
812
817
|
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
813
818
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
814
819
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
@@ -371,16 +371,16 @@ struct ggml_gallocr {
|
|
371
371
|
};
|
372
372
|
|
373
373
|
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
|
374
|
-
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr)
|
374
|
+
ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
|
375
375
|
GGML_ASSERT(galloc != NULL);
|
376
376
|
|
377
|
-
galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t)
|
377
|
+
galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
|
378
378
|
GGML_ASSERT(galloc->bufts != NULL);
|
379
379
|
|
380
|
-
galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs
|
380
|
+
galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
|
381
381
|
GGML_ASSERT(galloc->buffers != NULL);
|
382
382
|
|
383
|
-
galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *)
|
383
|
+
galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
|
384
384
|
GGML_ASSERT(galloc->buf_tallocs != NULL);
|
385
385
|
|
386
386
|
for (int i = 0; i < n_bufs; i++) {
|
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
646
646
|
free(galloc->hash_set.keys);
|
647
647
|
free(galloc->hash_values);
|
648
648
|
galloc->hash_set.size = hash_size;
|
649
|
-
galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *)
|
650
|
-
galloc->hash_values = calloc(sizeof(struct hash_node)
|
649
|
+
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
|
650
|
+
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
|
651
651
|
GGML_ASSERT(galloc->hash_set.keys != NULL);
|
652
652
|
GGML_ASSERT(galloc->hash_values != NULL);
|
653
653
|
} else {
|
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
667
667
|
// set the node_allocs from the hash table
|
668
668
|
if (galloc->n_nodes < graph->n_nodes) {
|
669
669
|
free(galloc->node_allocs);
|
670
|
-
galloc->node_allocs = calloc(sizeof(struct node_alloc)
|
670
|
+
galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
|
671
671
|
GGML_ASSERT(galloc->node_allocs != NULL);
|
672
672
|
}
|
673
673
|
galloc->n_nodes = graph->n_nodes;
|
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|
697
697
|
}
|
698
698
|
if (galloc->n_leafs < graph->n_leafs) {
|
699
699
|
free(galloc->leaf_allocs);
|
700
|
-
galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0])
|
700
|
+
galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
|
701
701
|
GGML_ASSERT(galloc->leaf_allocs != NULL);
|
702
702
|
}
|
703
703
|
galloc->n_leafs = graph->n_leafs;
|
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
|
|
822
822
|
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
823
823
|
switch (op->op) {
|
824
824
|
case GGML_OP_CPY:
|
825
|
-
return
|
825
|
+
return
|
826
|
+
op->type != GGML_TYPE_IQ2_XXS &&
|
827
|
+
op->type != GGML_TYPE_IQ2_XS &&
|
828
|
+
op->type != GGML_TYPE_IQ1_S &&
|
829
|
+
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
826
830
|
case GGML_OP_MUL_MAT:
|
827
831
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
828
832
|
default:
|
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
|
1721
1725
|
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
1722
1726
|
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
1723
1727
|
|
1724
|
-
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched)
|
1728
|
+
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
|
1725
1729
|
|
1726
1730
|
// initialize hash table
|
1727
1731
|
sched->hash_set = ggml_hash_set_new(graph_size);
|
1728
|
-
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0])
|
1729
|
-
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0])
|
1732
|
+
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
|
1733
|
+
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
|
1730
1734
|
|
1731
1735
|
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
|
1732
|
-
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0])
|
1733
|
-
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0])
|
1736
|
+
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
|
1737
|
+
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
|
1734
1738
|
|
1735
1739
|
sched->n_backends = n_backends;
|
1736
1740
|
|
1737
1741
|
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
|
1738
1742
|
|
1739
1743
|
const int initial_splits_capacity = 16;
|
1740
|
-
sched->splits = calloc(sizeof(sched->splits[0])
|
1744
|
+
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
|
1741
1745
|
sched->splits_capacity = initial_splits_capacity;
|
1742
1746
|
|
1743
1747
|
for (int b = 0; b < n_backends; b++) {
|
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
|
|
1968
1972
|
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
|
1969
1973
|
struct ggml_hash_set hash_set = {
|
1970
1974
|
/* .size = */ graph->visited_hash_table.size,
|
1971
|
-
/* .keys = */ calloc(sizeof(hash_set.keys[0])
|
1975
|
+
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
|
1972
1976
|
};
|
1973
|
-
struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0])
|
1974
|
-
bool * node_init = calloc(sizeof(node_init[0])
|
1977
|
+
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
|
1978
|
+
bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
|
1975
1979
|
|
1976
1980
|
struct ggml_init_params params = {
|
1977
1981
|
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
|
@@ -11,6 +11,12 @@
|
|
11
11
|
#include <string.h> // memcpy
|
12
12
|
#include <math.h> // fabsf
|
13
13
|
|
14
|
+
#undef MIN
|
15
|
+
#undef MAX
|
16
|
+
|
17
|
+
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
18
|
+
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
19
|
+
|
14
20
|
#ifdef __cplusplus
|
15
21
|
extern "C" {
|
16
22
|
#endif
|
@@ -45,7 +51,7 @@ extern "C" {
|
|
45
51
|
// 16-bit float
|
46
52
|
// on Arm, we use __fp16
|
47
53
|
// on x86, we use uint16_t
|
48
|
-
#if defined(__ARM_NEON)
|
54
|
+
#if defined(__ARM_NEON)
|
49
55
|
|
50
56
|
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
51
57
|
//
|
@@ -53,8 +59,262 @@ extern "C" {
|
|
53
59
|
//
|
54
60
|
#include <arm_neon.h>
|
55
61
|
|
62
|
+
#ifdef _MSC_VER
|
63
|
+
|
64
|
+
typedef uint16_t ggml_fp16_internal_t;
|
65
|
+
|
66
|
+
#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
|
67
|
+
|
68
|
+
#else
|
69
|
+
|
56
70
|
typedef __fp16 ggml_fp16_internal_t;
|
57
71
|
|
72
|
+
#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
|
73
|
+
|
74
|
+
#endif // _MSC_VER
|
75
|
+
|
76
|
+
#if !defined(__aarch64__)
|
77
|
+
|
78
|
+
// 32-bit ARM compatibility
|
79
|
+
|
80
|
+
// vaddvq_s16
|
81
|
+
// vpaddq_s16
|
82
|
+
// vpaddq_s32
|
83
|
+
// vaddvq_s32
|
84
|
+
// vaddvq_f32
|
85
|
+
// vmaxvq_f32
|
86
|
+
// vcvtnq_s32_f32
|
87
|
+
// vzip1_u8
|
88
|
+
// vzip2_u8
|
89
|
+
|
90
|
+
inline static int32_t vaddvq_s16(int16x8_t v) {
|
91
|
+
return
|
92
|
+
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
93
|
+
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
94
|
+
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
95
|
+
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
96
|
+
}
|
97
|
+
|
98
|
+
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
99
|
+
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
100
|
+
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
101
|
+
return vcombine_s16(a0, b0);
|
102
|
+
}
|
103
|
+
|
104
|
+
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
|
105
|
+
int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
|
106
|
+
int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
|
107
|
+
return vcombine_s32(a0, b0);
|
108
|
+
}
|
109
|
+
|
110
|
+
inline static int32_t vaddvq_s32(int32x4_t v) {
|
111
|
+
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
112
|
+
}
|
113
|
+
|
114
|
+
inline static float vaddvq_f32(float32x4_t v) {
|
115
|
+
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
116
|
+
}
|
117
|
+
|
118
|
+
inline static float vmaxvq_f32(float32x4_t v) {
|
119
|
+
return
|
120
|
+
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
121
|
+
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
122
|
+
}
|
123
|
+
|
124
|
+
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
|
125
|
+
int32x4_t res;
|
126
|
+
|
127
|
+
res[0] = roundf(vgetq_lane_f32(v, 0));
|
128
|
+
res[1] = roundf(vgetq_lane_f32(v, 1));
|
129
|
+
res[2] = roundf(vgetq_lane_f32(v, 2));
|
130
|
+
res[3] = roundf(vgetq_lane_f32(v, 3));
|
131
|
+
|
132
|
+
return res;
|
133
|
+
}
|
134
|
+
|
135
|
+
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
136
|
+
uint8x8_t res;
|
137
|
+
|
138
|
+
res[0] = a[0]; res[1] = b[0];
|
139
|
+
res[2] = a[1]; res[3] = b[1];
|
140
|
+
res[4] = a[2]; res[5] = b[2];
|
141
|
+
res[6] = a[3]; res[7] = b[3];
|
142
|
+
|
143
|
+
return res;
|
144
|
+
}
|
145
|
+
|
146
|
+
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
147
|
+
uint8x8_t res;
|
148
|
+
|
149
|
+
res[0] = a[4]; res[1] = b[4];
|
150
|
+
res[2] = a[5]; res[3] = b[5];
|
151
|
+
res[4] = a[6]; res[5] = b[6];
|
152
|
+
res[6] = a[7]; res[7] = b[7];
|
153
|
+
|
154
|
+
return res;
|
155
|
+
}
|
156
|
+
|
157
|
+
// vld1q_s16_x2
|
158
|
+
// vld1q_u8_x2
|
159
|
+
// vld1q_u8_x4
|
160
|
+
// vld1q_s8_x2
|
161
|
+
// vld1q_s8_x4
|
162
|
+
// TODO: double-check these work correctly
|
163
|
+
|
164
|
+
typedef struct ggml_int16x8x2_t {
|
165
|
+
int16x8_t val[2];
|
166
|
+
} ggml_int16x8x2_t;
|
167
|
+
|
168
|
+
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
|
169
|
+
ggml_int16x8x2_t res;
|
170
|
+
|
171
|
+
res.val[0] = vld1q_s16(ptr + 0);
|
172
|
+
res.val[1] = vld1q_s16(ptr + 8);
|
173
|
+
|
174
|
+
return res;
|
175
|
+
}
|
176
|
+
|
177
|
+
typedef struct ggml_uint8x16x2_t {
|
178
|
+
uint8x16_t val[2];
|
179
|
+
} ggml_uint8x16x2_t;
|
180
|
+
|
181
|
+
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
|
182
|
+
ggml_uint8x16x2_t res;
|
183
|
+
|
184
|
+
res.val[0] = vld1q_u8(ptr + 0);
|
185
|
+
res.val[1] = vld1q_u8(ptr + 16);
|
186
|
+
|
187
|
+
return res;
|
188
|
+
}
|
189
|
+
|
190
|
+
typedef struct ggml_uint8x16x4_t {
|
191
|
+
uint8x16_t val[4];
|
192
|
+
} ggml_uint8x16x4_t;
|
193
|
+
|
194
|
+
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
|
195
|
+
ggml_uint8x16x4_t res;
|
196
|
+
|
197
|
+
res.val[0] = vld1q_u8(ptr + 0);
|
198
|
+
res.val[1] = vld1q_u8(ptr + 16);
|
199
|
+
res.val[2] = vld1q_u8(ptr + 32);
|
200
|
+
res.val[3] = vld1q_u8(ptr + 48);
|
201
|
+
|
202
|
+
return res;
|
203
|
+
}
|
204
|
+
|
205
|
+
typedef struct ggml_int8x16x2_t {
|
206
|
+
int8x16_t val[2];
|
207
|
+
} ggml_int8x16x2_t;
|
208
|
+
|
209
|
+
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
|
210
|
+
ggml_int8x16x2_t res;
|
211
|
+
|
212
|
+
res.val[0] = vld1q_s8(ptr + 0);
|
213
|
+
res.val[1] = vld1q_s8(ptr + 16);
|
214
|
+
|
215
|
+
return res;
|
216
|
+
}
|
217
|
+
|
218
|
+
typedef struct ggml_int8x16x4_t {
|
219
|
+
int8x16_t val[4];
|
220
|
+
} ggml_int8x16x4_t;
|
221
|
+
|
222
|
+
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
|
223
|
+
ggml_int8x16x4_t res;
|
224
|
+
|
225
|
+
res.val[0] = vld1q_s8(ptr + 0);
|
226
|
+
res.val[1] = vld1q_s8(ptr + 16);
|
227
|
+
res.val[2] = vld1q_s8(ptr + 32);
|
228
|
+
res.val[3] = vld1q_s8(ptr + 48);
|
229
|
+
|
230
|
+
return res;
|
231
|
+
}
|
232
|
+
|
233
|
+
// NOTE: not tested
|
234
|
+
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
|
235
|
+
int8x16_t res;
|
236
|
+
|
237
|
+
res[ 0] = a[b[ 0]];
|
238
|
+
res[ 1] = a[b[ 1]];
|
239
|
+
res[ 2] = a[b[ 2]];
|
240
|
+
res[ 3] = a[b[ 3]];
|
241
|
+
res[ 4] = a[b[ 4]];
|
242
|
+
res[ 5] = a[b[ 5]];
|
243
|
+
res[ 6] = a[b[ 6]];
|
244
|
+
res[ 7] = a[b[ 7]];
|
245
|
+
res[ 8] = a[b[ 8]];
|
246
|
+
res[ 9] = a[b[ 9]];
|
247
|
+
res[10] = a[b[10]];
|
248
|
+
res[11] = a[b[11]];
|
249
|
+
res[12] = a[b[12]];
|
250
|
+
res[13] = a[b[13]];
|
251
|
+
res[14] = a[b[14]];
|
252
|
+
res[15] = a[b[15]];
|
253
|
+
|
254
|
+
return res;
|
255
|
+
}
|
256
|
+
|
257
|
+
// NOTE: not tested
|
258
|
+
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
|
259
|
+
uint8x16_t res;
|
260
|
+
|
261
|
+
res[ 0] = a[b[ 0]];
|
262
|
+
res[ 1] = a[b[ 1]];
|
263
|
+
res[ 2] = a[b[ 2]];
|
264
|
+
res[ 3] = a[b[ 3]];
|
265
|
+
res[ 4] = a[b[ 4]];
|
266
|
+
res[ 5] = a[b[ 5]];
|
267
|
+
res[ 6] = a[b[ 6]];
|
268
|
+
res[ 7] = a[b[ 7]];
|
269
|
+
res[ 8] = a[b[ 8]];
|
270
|
+
res[ 9] = a[b[ 9]];
|
271
|
+
res[10] = a[b[10]];
|
272
|
+
res[11] = a[b[11]];
|
273
|
+
res[12] = a[b[12]];
|
274
|
+
res[13] = a[b[13]];
|
275
|
+
res[14] = a[b[14]];
|
276
|
+
res[15] = a[b[15]];
|
277
|
+
|
278
|
+
return res;
|
279
|
+
}
|
280
|
+
|
281
|
+
#else
|
282
|
+
|
283
|
+
#define ggml_int16x8x2_t int16x8x2_t
|
284
|
+
#define ggml_uint8x16x2_t uint8x16x2_t
|
285
|
+
#define ggml_uint8x16x4_t uint8x16x4_t
|
286
|
+
#define ggml_int8x16x2_t int8x16x2_t
|
287
|
+
#define ggml_int8x16x4_t int8x16x4_t
|
288
|
+
|
289
|
+
#define ggml_vld1q_s16_x2 vld1q_s16_x2
|
290
|
+
#define ggml_vld1q_u8_x2 vld1q_u8_x2
|
291
|
+
#define ggml_vld1q_u8_x4 vld1q_u8_x4
|
292
|
+
#define ggml_vld1q_s8_x2 vld1q_s8_x2
|
293
|
+
#define ggml_vld1q_s8_x4 vld1q_s8_x4
|
294
|
+
#define ggml_vqtbl1q_s8 vqtbl1q_s8
|
295
|
+
#define ggml_vqtbl1q_u8 vqtbl1q_u8
|
296
|
+
|
297
|
+
#endif // !defined(__aarch64__)
|
298
|
+
|
299
|
+
#if !defined(__ARM_FEATURE_DOTPROD)
|
300
|
+
|
301
|
+
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
|
302
|
+
const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
|
303
|
+
const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
|
304
|
+
|
305
|
+
return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
|
306
|
+
}
|
307
|
+
|
308
|
+
#else
|
309
|
+
|
310
|
+
#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
|
311
|
+
|
312
|
+
#endif // !defined(__ARM_FEATURE_DOTPROD)
|
313
|
+
|
314
|
+
#endif // defined(__ARM_NEON)
|
315
|
+
|
316
|
+
#if defined(__ARM_NEON) && !defined(__MSC_VER)
|
317
|
+
|
58
318
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
59
319
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
60
320
|
|
@@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
75
335
|
|
76
336
|
#else
|
77
337
|
|
78
|
-
typedef uint16_t ggml_fp16_internal_t;
|
79
|
-
|
80
338
|
#ifdef __wasm_simd128__
|
81
339
|
#include <wasm_simd128.h>
|
82
340
|
#else
|
@@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
221
479
|
|
222
480
|
#endif // __F16C__
|
223
481
|
|
224
|
-
#endif // __ARM_NEON
|
482
|
+
#endif // defined(__ARM_NEON) && !defined(__MSC_VER)
|
225
483
|
|
226
484
|
// precomputed f32 table for f16 (256 KB)
|
227
485
|
// defined in ggml.c, initialized in ggml_init()
|