llama_cpp 0.14.6 → 0.14.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
4
- data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
3
+ metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
4
+ data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
5
5
  SHA512:
6
- metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
7
- data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
6
+ metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
7
+ data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
2
+
3
+ - Bump llama.cpp from b2698 to b2740.
4
+ - Add `keep_split` accessor to `ModelQuantizeParams`.
5
+ - Add `pooling_type` method to `Context`.
6
+ - Add `token_is_eog?` method to `Model`.
7
+
8
+ The binding for `llama_sample_token_with_rng` has been skipped (not implemented).
9
+
1
10
  ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20
2
11
 
3
12
  - Bump llama.cpp from b2658 to b2698.
@@ -1321,6 +1321,8 @@ public:
1321
1321
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
1322
1322
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
1323
1323
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
1324
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
1325
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
1324
1326
  }
1325
1327
 
1326
1328
  private:
@@ -1405,6 +1407,18 @@ private:
1405
1407
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1406
1408
  return ptr->params.pure ? Qtrue : Qfalse;
1407
1409
  }
1410
+
1411
+ // keep_split
1412
+ static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
1413
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1414
+ ptr->params.keep_split = RTEST(keep_split) ? true : false;
1415
+ return ptr->params.keep_split ? Qtrue : Qfalse;
1416
+ }
1417
+
1418
+ static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
1419
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
1420
+ return ptr->params.keep_split ? Qtrue : Qfalse;
1421
+ }
1408
1422
  };
1409
1423
 
1410
1424
  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1501,7 @@ public:
1487
1501
  rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
1488
1502
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
1489
1503
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
1504
+ rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
1490
1505
  }
1491
1506
 
1492
1507
  private:
@@ -1634,10 +1649,10 @@ private:
1634
1649
  const llama_token token = NUM2INT(token_);
1635
1650
  LLaMAModelWrapper* ptr = get_llama_model(self);
1636
1651
  std::vector<char> result(8, 0);
1637
- const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
1652
+ const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
1638
1653
  if (n_tokens < 0) {
1639
1654
  result.resize(-n_tokens);
1640
- const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
1655
+ const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
1641
1656
  if (check != -n_tokens) {
1642
1657
  rb_raise(rb_eRuntimeError, "failed to convert");
1643
1658
  return Qnil;
@@ -1789,6 +1804,16 @@ private:
1789
1804
  LLaMAModelWrapper* ptr = get_llama_model(self);
1790
1805
  return INT2NUM(llama_token_eot(ptr->model));
1791
1806
  }
1807
+
1808
+ static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
1809
+ if (!RB_INTEGER_TYPE_P(token_)) {
1810
+ rb_raise(rb_eArgError, "token must be an integer");
1811
+ return Qnil;
1812
+ }
1813
+ const llama_token token = NUM2INT(token_);
1814
+ LLaMAModelWrapper* ptr = get_llama_model(self);
1815
+ return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
1816
+ }
1792
1817
  };
1793
1818
 
1794
1819
  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2127,7 @@ public:
2102
2127
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
2103
2128
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
2104
2129
  rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
2130
+ rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
2105
2131
  }
2106
2132
 
2107
2133
  private:
@@ -3225,6 +3251,15 @@ private:
3225
3251
 
3226
3252
  return Qnil;
3227
3253
  }
3254
+
3255
+ static VALUE _llama_context_pooling_type(VALUE self) {
3256
+ LLaMAContextWrapper* ptr = get_llama_context(self);
3257
+ if (ptr->ctx == NULL) {
3258
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
3259
+ return Qnil;
3260
+ }
3261
+ return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
3262
+ }
3228
3263
  };
3229
3264
 
3230
3265
  const rb_data_type_t RbLLaMAContext::llama_context_type = {
@@ -3,8 +3,8 @@
3
3
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
4
4
  module LLaMACpp
5
5
  # The version of llama_cpp.rb you install.
6
- VERSION = '0.14.6'
6
+ VERSION = '0.14.7'
7
7
 
8
8
  # The version of llama.cpp bundled with llama_cpp.rb.
9
- LLAMA_CPP_VERSION = 'b2698'
9
+ LLAMA_CPP_VERSION = 'b2740'
10
10
  end
data/sig/llama_cpp.rbs CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
141
141
  def token_middle: () -> Integer
142
142
  def token_suffix: () -> Integer
143
143
  def token_eot: () -> Integer
144
+ def token_is_eog?: (Integer) -> bool
144
145
  end
145
146
 
146
147
  class Timings
@@ -260,6 +261,7 @@ module LLaMACpp
260
261
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
261
262
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
262
263
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
264
+ def pooling_type: () -> Integer
263
265
  end
264
266
 
265
267
  class ContextParams
@@ -328,6 +330,8 @@ module LLaMACpp
328
330
  def only_copy=: (bool) -> bool
329
331
  def pure: () -> bool
330
332
  def pure=: (bool) -> bool
333
+ def keep_split: () -> bool
334
+ def keep_split=: (bool) -> bool
331
335
  end
332
336
 
333
337
  class Params = ContextParams
@@ -386,10 +386,6 @@ ifdef LLAMA_OPENBLAS
386
386
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
387
387
  endif # LLAMA_OPENBLAS
388
388
 
389
- # TODO: temporary disable until MoE is fixed
390
- # https://github.com/ggerganov/llama.cpp/pull/6716
391
- LLAMA_NO_LLAMAFILE := 1
392
-
393
389
  ifndef LLAMA_NO_LLAMAFILE
394
390
  MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
395
391
  OBJS += sgemm.o
@@ -701,7 +697,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
701
697
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
702
698
  $(CXX) $(CXXFLAGS) -c $< -o $@
703
699
 
704
- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
700
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
705
701
  COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
706
702
 
707
703
  common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -805,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
805
801
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
806
802
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
807
803
 
808
- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
804
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
809
805
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
810
806
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
811
807
 
808
+ # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
809
+ examples/server/%.hpp: examples/server/public/% Makefile
810
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
811
+ echo "unsigned char $${NAME}[] = {" && \
812
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
813
+ echo "};" && \
814
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
815
+ ) > $@
816
+
812
817
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
813
818
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
814
819
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -371,16 +371,16 @@ struct ggml_gallocr {
371
371
  };
372
372
 
373
373
  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
374
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
374
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
375
375
  GGML_ASSERT(galloc != NULL);
376
376
 
377
- galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
377
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
378
378
  GGML_ASSERT(galloc->bufts != NULL);
379
379
 
380
- galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
380
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
381
381
  GGML_ASSERT(galloc->buffers != NULL);
382
382
 
383
- galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
383
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
384
384
  GGML_ASSERT(galloc->buf_tallocs != NULL);
385
385
 
386
386
  for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
646
646
  free(galloc->hash_set.keys);
647
647
  free(galloc->hash_values);
648
648
  galloc->hash_set.size = hash_size;
649
- galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
650
- galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
649
+ galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
650
+ galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
651
651
  GGML_ASSERT(galloc->hash_set.keys != NULL);
652
652
  GGML_ASSERT(galloc->hash_values != NULL);
653
653
  } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
667
667
  // set the node_allocs from the hash table
668
668
  if (galloc->n_nodes < graph->n_nodes) {
669
669
  free(galloc->node_allocs);
670
- galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
670
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
671
671
  GGML_ASSERT(galloc->node_allocs != NULL);
672
672
  }
673
673
  galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
697
697
  }
698
698
  if (galloc->n_leafs < graph->n_leafs) {
699
699
  free(galloc->leaf_allocs);
700
- galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
700
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
701
701
  GGML_ASSERT(galloc->leaf_allocs != NULL);
702
702
  }
703
703
  galloc->n_leafs = graph->n_leafs;
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
822
822
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
823
823
  switch (op->op) {
824
824
  case GGML_OP_CPY:
825
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
825
+ return
826
+ op->type != GGML_TYPE_IQ2_XXS &&
827
+ op->type != GGML_TYPE_IQ2_XS &&
828
+ op->type != GGML_TYPE_IQ1_S &&
829
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
826
830
  case GGML_OP_MUL_MAT:
827
831
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
828
832
  default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
1721
1725
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
1722
1726
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
1723
1727
 
1724
- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
1728
+ struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
1725
1729
 
1726
1730
  // initialize hash table
1727
1731
  sched->hash_set = ggml_hash_set_new(graph_size);
1728
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
1729
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
1732
+ sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
1733
+ sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
1730
1734
 
1731
1735
  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
1732
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
1733
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
1736
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
1737
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
1734
1738
 
1735
1739
  sched->n_backends = n_backends;
1736
1740
 
1737
1741
  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
1738
1742
 
1739
1743
  const int initial_splits_capacity = 16;
1740
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
1744
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
1741
1745
  sched->splits_capacity = initial_splits_capacity;
1742
1746
 
1743
1747
  for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
1968
1972
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
1969
1973
  struct ggml_hash_set hash_set = {
1970
1974
  /* .size = */ graph->visited_hash_table.size,
1971
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
1975
+ /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
1972
1976
  };
1973
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
1974
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
1977
+ struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
1978
+ bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
1975
1979
 
1976
1980
  struct ggml_init_params params = {
1977
1981
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -11,6 +11,12 @@
11
11
  #include <string.h> // memcpy
12
12
  #include <math.h> // fabsf
13
13
 
14
+ #undef MIN
15
+ #undef MAX
16
+
17
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
18
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
19
+
14
20
  #ifdef __cplusplus
15
21
  extern "C" {
16
22
  #endif
@@ -45,7 +51,7 @@ extern "C" {
45
51
  // 16-bit float
46
52
  // on Arm, we use __fp16
47
53
  // on x86, we use uint16_t
48
- #if defined(__ARM_NEON) && !defined(_MSC_VER)
54
+ #if defined(__ARM_NEON)
49
55
 
50
56
  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
51
57
  //
@@ -53,8 +59,262 @@ extern "C" {
53
59
  //
54
60
  #include <arm_neon.h>
55
61
 
62
+ #ifdef _MSC_VER
63
+
64
+ typedef uint16_t ggml_fp16_internal_t;
65
+
66
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
67
+
68
+ #else
69
+
56
70
  typedef __fp16 ggml_fp16_internal_t;
57
71
 
72
+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
73
+
74
+ #endif // _MSC_VER
75
+
76
+ #if !defined(__aarch64__)
77
+
78
+ // 32-bit ARM compatibility
79
+
80
+ // vaddvq_s16
81
+ // vpaddq_s16
82
+ // vpaddq_s32
83
+ // vaddvq_s32
84
+ // vaddvq_f32
85
+ // vmaxvq_f32
86
+ // vcvtnq_s32_f32
87
+ // vzip1_u8
88
+ // vzip2_u8
89
+
90
+ inline static int32_t vaddvq_s16(int16x8_t v) {
91
+ return
92
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
93
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
94
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
95
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
96
+ }
97
+
98
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
99
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
100
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
101
+ return vcombine_s16(a0, b0);
102
+ }
103
+
104
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
105
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
106
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
107
+ return vcombine_s32(a0, b0);
108
+ }
109
+
110
+ inline static int32_t vaddvq_s32(int32x4_t v) {
111
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
112
+ }
113
+
114
+ inline static float vaddvq_f32(float32x4_t v) {
115
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
116
+ }
117
+
118
+ inline static float vmaxvq_f32(float32x4_t v) {
119
+ return
120
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
121
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
122
+ }
123
+
124
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
125
+ int32x4_t res;
126
+
127
+ res[0] = roundf(vgetq_lane_f32(v, 0));
128
+ res[1] = roundf(vgetq_lane_f32(v, 1));
129
+ res[2] = roundf(vgetq_lane_f32(v, 2));
130
+ res[3] = roundf(vgetq_lane_f32(v, 3));
131
+
132
+ return res;
133
+ }
134
+
135
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
136
+ uint8x8_t res;
137
+
138
+ res[0] = a[0]; res[1] = b[0];
139
+ res[2] = a[1]; res[3] = b[1];
140
+ res[4] = a[2]; res[5] = b[2];
141
+ res[6] = a[3]; res[7] = b[3];
142
+
143
+ return res;
144
+ }
145
+
146
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
147
+ uint8x8_t res;
148
+
149
+ res[0] = a[4]; res[1] = b[4];
150
+ res[2] = a[5]; res[3] = b[5];
151
+ res[4] = a[6]; res[5] = b[6];
152
+ res[6] = a[7]; res[7] = b[7];
153
+
154
+ return res;
155
+ }
156
+
157
+ // vld1q_s16_x2
158
+ // vld1q_u8_x2
159
+ // vld1q_u8_x4
160
+ // vld1q_s8_x2
161
+ // vld1q_s8_x4
162
+ // TODO: double-check these work correctly
163
+
164
+ typedef struct ggml_int16x8x2_t {
165
+ int16x8_t val[2];
166
+ } ggml_int16x8x2_t;
167
+
168
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
169
+ ggml_int16x8x2_t res;
170
+
171
+ res.val[0] = vld1q_s16(ptr + 0);
172
+ res.val[1] = vld1q_s16(ptr + 8);
173
+
174
+ return res;
175
+ }
176
+
177
+ typedef struct ggml_uint8x16x2_t {
178
+ uint8x16_t val[2];
179
+ } ggml_uint8x16x2_t;
180
+
181
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
182
+ ggml_uint8x16x2_t res;
183
+
184
+ res.val[0] = vld1q_u8(ptr + 0);
185
+ res.val[1] = vld1q_u8(ptr + 16);
186
+
187
+ return res;
188
+ }
189
+
190
+ typedef struct ggml_uint8x16x4_t {
191
+ uint8x16_t val[4];
192
+ } ggml_uint8x16x4_t;
193
+
194
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
195
+ ggml_uint8x16x4_t res;
196
+
197
+ res.val[0] = vld1q_u8(ptr + 0);
198
+ res.val[1] = vld1q_u8(ptr + 16);
199
+ res.val[2] = vld1q_u8(ptr + 32);
200
+ res.val[3] = vld1q_u8(ptr + 48);
201
+
202
+ return res;
203
+ }
204
+
205
+ typedef struct ggml_int8x16x2_t {
206
+ int8x16_t val[2];
207
+ } ggml_int8x16x2_t;
208
+
209
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
210
+ ggml_int8x16x2_t res;
211
+
212
+ res.val[0] = vld1q_s8(ptr + 0);
213
+ res.val[1] = vld1q_s8(ptr + 16);
214
+
215
+ return res;
216
+ }
217
+
218
+ typedef struct ggml_int8x16x4_t {
219
+ int8x16_t val[4];
220
+ } ggml_int8x16x4_t;
221
+
222
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
223
+ ggml_int8x16x4_t res;
224
+
225
+ res.val[0] = vld1q_s8(ptr + 0);
226
+ res.val[1] = vld1q_s8(ptr + 16);
227
+ res.val[2] = vld1q_s8(ptr + 32);
228
+ res.val[3] = vld1q_s8(ptr + 48);
229
+
230
+ return res;
231
+ }
232
+
233
+ // NOTE: not tested
234
+ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
235
+ int8x16_t res;
236
+
237
+ res[ 0] = a[b[ 0]];
238
+ res[ 1] = a[b[ 1]];
239
+ res[ 2] = a[b[ 2]];
240
+ res[ 3] = a[b[ 3]];
241
+ res[ 4] = a[b[ 4]];
242
+ res[ 5] = a[b[ 5]];
243
+ res[ 6] = a[b[ 6]];
244
+ res[ 7] = a[b[ 7]];
245
+ res[ 8] = a[b[ 8]];
246
+ res[ 9] = a[b[ 9]];
247
+ res[10] = a[b[10]];
248
+ res[11] = a[b[11]];
249
+ res[12] = a[b[12]];
250
+ res[13] = a[b[13]];
251
+ res[14] = a[b[14]];
252
+ res[15] = a[b[15]];
253
+
254
+ return res;
255
+ }
256
+
257
+ // NOTE: not tested
258
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
259
+ uint8x16_t res;
260
+
261
+ res[ 0] = a[b[ 0]];
262
+ res[ 1] = a[b[ 1]];
263
+ res[ 2] = a[b[ 2]];
264
+ res[ 3] = a[b[ 3]];
265
+ res[ 4] = a[b[ 4]];
266
+ res[ 5] = a[b[ 5]];
267
+ res[ 6] = a[b[ 6]];
268
+ res[ 7] = a[b[ 7]];
269
+ res[ 8] = a[b[ 8]];
270
+ res[ 9] = a[b[ 9]];
271
+ res[10] = a[b[10]];
272
+ res[11] = a[b[11]];
273
+ res[12] = a[b[12]];
274
+ res[13] = a[b[13]];
275
+ res[14] = a[b[14]];
276
+ res[15] = a[b[15]];
277
+
278
+ return res;
279
+ }
280
+
281
+ #else
282
+
283
+ #define ggml_int16x8x2_t int16x8x2_t
284
+ #define ggml_uint8x16x2_t uint8x16x2_t
285
+ #define ggml_uint8x16x4_t uint8x16x4_t
286
+ #define ggml_int8x16x2_t int8x16x2_t
287
+ #define ggml_int8x16x4_t int8x16x4_t
288
+
289
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
290
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
291
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
292
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
293
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
294
+ #define ggml_vqtbl1q_s8 vqtbl1q_s8
295
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
296
+
297
+ #endif // !defined(__aarch64__)
298
+
299
+ #if !defined(__ARM_FEATURE_DOTPROD)
300
+
301
+ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
302
+ const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
303
+ const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
304
+
305
+ return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
306
+ }
307
+
308
+ #else
309
+
310
+ #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
311
+
312
+ #endif // !defined(__ARM_FEATURE_DOTPROD)
313
+
314
+ #endif // defined(__ARM_NEON)
315
+
316
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)
317
+
58
318
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
59
319
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
60
320
 
@@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
75
335
 
76
336
  #else
77
337
 
78
- typedef uint16_t ggml_fp16_internal_t;
79
-
80
338
  #ifdef __wasm_simd128__
81
339
  #include <wasm_simd128.h>
82
340
  #else
@@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
221
479
 
222
480
  #endif // __F16C__
223
481
 
224
- #endif // __ARM_NEON
482
+ #endif // defined(__ARM_NEON) && !defined(_MSC_VER)
225
483
 
226
484
  // precomputed f32 table for f16 (256 KB)
227
485
  // defined in ggml.c, initialized in ggml_init()