llama_cpp 0.14.6 → 0.14.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 5c4bd6bcb93b98a00f94dcdf93d04f853174f73e281d96fce8f837a6ba7f250e
- data.tar.gz: 6d184e9ce927c06ba794bea63a09007a175a72e477366ffb1c5763ceb2c7c71e
+ metadata.gz: 243241c78383cb68d4fb5027ffc54ea7f6789bd74bfe85fae8e62d45e7c3145d
+ data.tar.gz: b7c792c6fb2287b71a72ff823a31706dc0830aa704d86e6f8a92d1d0630649d9
  SHA512:
- metadata.gz: 953fe2777a759e5467694b8afb9d3f929a42603e81b2c3e38ba0fda4bb6dca78b2d147345023f99c2c9fb899cc746bf6729ad2726c2cb473d7094e93c13caf73
- data.tar.gz: 71eb3cd5a5c619e9cc8a3418be745a8b76dc5e8cabe5b26a766230a8533df9a11c3981601b0be4ec0adb34a49f86ad741503ffc9f3b0d7ba021a7e9ddc3246a7
+ metadata.gz: 59565cd5e6bd79d98d31dcf1ce505c8388a97296f607c2a114cf92a614a2cd39291a8a18a3f58993606ea3f0970d1eadbfe670280c5261c5826a54d77a2eb85d
+ data.tar.gz: 228bc19181b0163ef922e847f67e7b6a52dc1311c4e8173586dfca82eb402c5a08c104b5bac5ba0eee4772f615f8fd17f2d06cbc6db5323d133a46d3de85eeb4
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
+ ## [[0.14.7](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.6...v0.14.7)] - 2024-04-27
+
+ - Bump llama.cpp from b2698 to b2740.
+ - Add `keep_split` accessor to `ModelQuantizeParams`.
+ - Add `pooling_type` method to `Context`.
+ - Add `token_is_eog?` method to `Model`.
+
+ Implementation binding for llama_sample_token_with_rng has been skipped.
+
  ## [[0.14.6](https://github.com/yoshoku/llama_cpp.rb/compare/v0.14.5...v0.14.6)] - 2024-04-20

  - Bump llama.cpp from b2658 to b2698.
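A minimal usage sketch for the new `Model#token_is_eog?` predicate listed above. The GGUF path and the `Model`/`ModelParams` constructor keywords are assumptions based on the gem's 0.14.x README, not part of this diff; only `token_is_eog?` (and the existing `token_eos`) are confirmed by the changes below.

```ruby
require 'llama_cpp'

# Hypothetical model path; constructor keywords assumed from the 0.14.x README.
model = LLaMACpp::Model.new(model_path: 'model.gguf', params: LLaMACpp::ModelParams.new)

# New in 0.14.7: true for any end-of-generation token (EOS as well as the
# end-of-turn tokens used by chat-tuned models), mirroring llama_token_is_eog.
p model.token_is_eog?(model.token_eos) # => true

# In a sampling loop this replaces a plain `token == model.token_eos` check:
#   break if model.token_is_eog?(token)
```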
@@ -1321,6 +1321,8 @@ public:
  rb_define_method(rb_cLLaMAModelQuantizeParams, "only_copy", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_only_copy), 0);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_pure), 1);
  rb_define_method(rb_cLLaMAModelQuantizeParams, "pure", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_pure), 0);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split=", RUBY_METHOD_FUNC(_llama_model_quantize_params_set_keep_split), 1);
+ rb_define_method(rb_cLLaMAModelQuantizeParams, "keep_split", RUBY_METHOD_FUNC(_llama_model_quantize_params_get_keep_split), 0);
  }

  private:
@@ -1405,6 +1407,18 @@ private:
  LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
  return ptr->params.pure ? Qtrue : Qfalse;
  }
+
+ // keep_split
+ static VALUE _llama_model_quantize_params_set_keep_split(VALUE self, VALUE keep_split) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ ptr->params.keep_split = RTEST(keep_split) ? true : false;
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
+
+ static VALUE _llama_model_quantize_params_get_keep_split(VALUE self) {
+ LLaMAModelQuantizeParamsWrapper* ptr = get_llama_model_quantize_params(self);
+ return ptr->params.keep_split ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModelQuantizeParams::llama_model_quantize_params_type = {
@@ -1487,6 +1501,7 @@ public:
  rb_define_method(rb_cLLaMAModel, "token_middle", RUBY_METHOD_FUNC(_llama_model_token_middle), 0);
  rb_define_method(rb_cLLaMAModel, "token_suffix", RUBY_METHOD_FUNC(_llama_model_token_suffix), 0);
  rb_define_method(rb_cLLaMAModel, "token_eot", RUBY_METHOD_FUNC(_llama_model_token_eot), 0);
+ rb_define_method(rb_cLLaMAModel, "token_is_eog?", RUBY_METHOD_FUNC(_llama_model_token_is_eog), 1);
  }

  private:
@@ -1634,10 +1649,10 @@ private:
  const llama_token token = NUM2INT(token_);
  LLaMAModelWrapper* ptr = get_llama_model(self);
  std::vector<char> result(8, 0);
- const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int n_tokens = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (n_tokens < 0) {
  result.resize(-n_tokens);
- const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size());
+ const int check = llama_token_to_piece(ptr->model, token, result.data(), result.size(), false);
  if (check != -n_tokens) {
  rb_raise(rb_eRuntimeError, "failed to convert");
  return Qnil;
@@ -1789,6 +1804,16 @@ private:
  LLaMAModelWrapper* ptr = get_llama_model(self);
  return INT2NUM(llama_token_eot(ptr->model));
  }
+
+ static VALUE _llama_model_token_is_eog(VALUE self, VALUE token_) {
+ if (!RB_INTEGER_TYPE_P(token_)) {
+ rb_raise(rb_eArgError, "token must be an integer");
+ return Qnil;
+ }
+ const llama_token token = NUM2INT(token_);
+ LLaMAModelWrapper* ptr = get_llama_model(self);
+ return llama_token_is_eog(ptr->model, token) ? Qtrue : Qfalse;
+ }
  };

  const rb_data_type_t RbLLaMAModel::llama_model_type = {
@@ -2102,6 +2127,7 @@ public:
  rb_define_method(rb_cLLaMAContext, "sample_grammar", RUBY_METHOD_FUNC(_llama_context_sample_grammar), -1);
  rb_define_method(rb_cLLaMAContext, "grammar_accept_token", RUBY_METHOD_FUNC(_llama_context_grammar_accept_token), -1);
  rb_define_method(rb_cLLaMAContext, "apply_control_vector", RUBY_METHOD_FUNC(_llama_context_apply_control_vector), -1);
+ rb_define_method(rb_cLLaMAContext, "pooling_type", RUBY_METHOD_FUNC(_llama_context_pooling_type), 0);
  }

  private:
@@ -3225,6 +3251,15 @@ private:

  return Qnil;
  }
+
+ static VALUE _llama_context_pooling_type(VALUE self) {
+ LLaMAContextWrapper* ptr = get_llama_context(self);
+ if (ptr->ctx == NULL) {
+ rb_raise(rb_eRuntimeError, "LLaMA context is not initialized");
+ return Qnil;
+ }
+ return INT2NUM(static_cast<int>(llama_pooling_type(ptr->ctx)));
+ }
  };

  const rb_data_type_t RbLLaMAContext::llama_context_type = {
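A sketch of the `keep_split` accessor whose bindings are registered above. In llama.cpp, `keep_split` asks the quantizer to write a sharded (split) GGUF back out with the same number of shards instead of merging it. The file names are placeholders, and `LLaMACpp.model_quantize` is assumed to keep its existing keyword API from earlier 0.14.x releases.

```ruby
require 'llama_cpp'

params = LLaMACpp::ModelQuantizeParams.new
params.keep_split = true   # new in 0.14.7; any truthy value is stored as true
p params.keep_split        # => true

# Placeholder paths; model_quantize keyword API assumed from earlier releases.
LLaMACpp.model_quantize(
  input_path:  'model-f16-00001-of-00002.gguf',
  output_path: 'model-q4_k_m-00001-of-00002.gguf',
  params: params
)
```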
@@ -3,8 +3,8 @@
  # llama_cpp.rb provides Ruby bindings for the llama.cpp.
  module LLaMACpp
  # The version of llama_cpp.rb you install.
- VERSION = '0.14.6'
+ VERSION = '0.14.7'

  # The version of llama.cpp bundled with llama_cpp.rb.
- LLAMA_CPP_VERSION = 'b2698'
+ LLAMA_CPP_VERSION = 'b2740'
  end
data/sig/llama_cpp.rbs CHANGED
@@ -141,6 +141,7 @@ module LLaMACpp
  def token_middle: () -> Integer
  def token_suffix: () -> Integer
  def token_eot: () -> Integer
+ def token_is_eog?: (Integer) -> bool
  end

  class Timings
@@ -260,6 +261,7 @@ module LLaMACpp
  def sample_grammar: (::LLaMACpp::TokenDataArray, grammar: ::LLaMACpp::Grammar) -> void
  def grammar_accept_token: (grammar: ::LLaMACpp::Grammar, token: Integer) -> void
  def apply_control_vector: (data: Array[Float], n_embd: Integer, il_start: Integer, il_end: Integer) -> void
+ def pooling_type: () -> Integer
  end

  class ContextParams
@@ -328,6 +330,8 @@ module LLaMACpp
  def only_copy=: (bool) -> bool
  def pure: () -> bool
  def pure=: (bool) -> bool
+ def keep_split: () -> bool
+ def keep_split=: (bool) -> bool
  end

  class Params = ContextParams
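The signature above types `Context#pooling_type` as a bare Integer matching the `llama_pooling_type` enum. Below is a small sketch mapping that value to a symbol, assuming the `LLAMA_POOLING_TYPE_*` constants the gem already exposes for `ContextParams`; the helper name is illustrative only.

```ruby
require 'llama_cpp'

# Assumed constants mirroring llama.cpp's llama_pooling_type enum.
POOLING_NAMES = {
  LLaMACpp::LLAMA_POOLING_TYPE_NONE => :none,
  LLaMACpp::LLAMA_POOLING_TYPE_MEAN => :mean,
  LLaMACpp::LLAMA_POOLING_TYPE_CLS  => :cls
}.freeze

# `context` is an initialized LLaMACpp::Context; #pooling_type is new in 0.14.7.
def pooling_name(context)
  POOLING_NAMES.fetch(context.pooling_type, :unspecified)
end
```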
@@ -386,10 +386,6 @@ ifdef LLAMA_OPENBLAS
  MK_LDFLAGS += $(shell pkg-config --libs openblas)
  endif # LLAMA_OPENBLAS

- # TODO: temporary disable until MoE is fixed
- # https://github.com/ggerganov/llama.cpp/pull/6716
- LLAMA_NO_LLAMAFILE := 1
-
  ifndef LLAMA_NO_LLAMAFILE
  MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
  OBJS += sgemm.o
@@ -701,7 +697,7 @@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
  llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
  $(CXX) $(CXXFLAGS) -c $< -o $@

- COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
+ COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
  COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

  common.o: common/common.cpp $(COMMON_H_DEPS)
@@ -805,10 +801,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

- server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+ server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

+ # Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
+ examples/server/%.hpp: examples/server/public/% Makefile
+ @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
+ echo "unsigned char $${NAME}[] = {" && \
+ cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
+ echo "};" && \
+ echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
+ ) > $@
+
  gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
  $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
  $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -371,16 +371,16 @@ struct ggml_gallocr {
  };

  ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
  GGML_ASSERT(galloc != NULL);

- galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
  GGML_ASSERT(galloc->bufts != NULL);

- galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
  GGML_ASSERT(galloc->buffers != NULL);

- galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
  GGML_ASSERT(galloc->buf_tallocs != NULL);

  for (int i = 0; i < n_bufs; i++) {
@@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  free(galloc->hash_set.keys);
  free(galloc->hash_values);
  galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
- galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
+ galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
  GGML_ASSERT(galloc->hash_set.keys != NULL);
  GGML_ASSERT(galloc->hash_values != NULL);
  } else {
@@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  // set the node_allocs from the hash table
  if (galloc->n_nodes < graph->n_nodes) {
  free(galloc->node_allocs);
- galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
  GGML_ASSERT(galloc->node_allocs != NULL);
  }
  galloc->n_nodes = graph->n_nodes;
@@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  }
  if (galloc->n_leafs < graph->n_leafs) {
  free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs);
+ galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
  GGML_ASSERT(galloc->leaf_allocs != NULL);
  }
  galloc->n_leafs = graph->n_leafs;
@@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t
  GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
  switch (op->op) {
  case GGML_OP_CPY:
- return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float
+ return
+ op->type != GGML_TYPE_IQ2_XXS &&
+ op->type != GGML_TYPE_IQ2_XS &&
+ op->type != GGML_TYPE_IQ1_S &&
+ op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
  case GGML_OP_MUL_MAT:
  return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
  default:
@@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
  GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
  GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU

- struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+ struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));

  // initialize hash table
  sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
- sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
+ sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));

  const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
- sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
+ sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
+ sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));

  sched->n_backends = n_backends;

  sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

  const int initial_splits_capacity = 16;
- sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+ sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
  sched->splits_capacity = initial_splits_capacity;

  for (int b = 0; b < n_backends; b++) {
@@ -1968,10 +1972,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
  struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
  struct ggml_hash_set hash_set = {
  /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
+ /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
  };
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
- bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
+ struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
+ bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));

  struct ggml_init_params params = {
  /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
@@ -11,6 +11,12 @@
  #include <string.h> // memcpy
  #include <math.h> // fabsf

+ #undef MIN
+ #undef MAX
+
+ #define MIN(a, b) ((a) < (b) ? (a) : (b))
+ #define MAX(a, b) ((a) > (b) ? (a) : (b))
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -45,7 +51,7 @@ extern "C" {
  // 16-bit float
  // on Arm, we use __fp16
  // on x86, we use uint16_t
- #if defined(__ARM_NEON) && !defined(_MSC_VER)
+ #if defined(__ARM_NEON)

  // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
  //
@@ -53,8 +59,262 @@ extern "C" {
  //
  #include <arm_neon.h>

+ #ifdef _MSC_VER
+
+ typedef uint16_t ggml_fp16_internal_t;
+
+ #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
+
+ #else
+
  typedef __fp16 ggml_fp16_internal_t;

+ #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
+
+ #endif // _MSC_VER
+
+ #if !defined(__aarch64__)
+
+ // 32-bit ARM compatibility
+
+ // vaddvq_s16
+ // vpaddq_s16
+ // vpaddq_s32
+ // vaddvq_s32
+ // vaddvq_f32
+ // vmaxvq_f32
+ // vcvtnq_s32_f32
+ // vzip1_u8
+ // vzip2_u8
+
+ inline static int32_t vaddvq_s16(int16x8_t v) {
+ return
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+ return vcombine_s16(a0, b0);
+ }
+
+ inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
+ int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+ int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+ return vcombine_s32(a0, b0);
+ }
+
+ inline static int32_t vaddvq_s32(int32x4_t v) {
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+ }
+
+ inline static float vaddvq_f32(float32x4_t v) {
+ return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+ }
+
+ inline static float vmaxvq_f32(float32x4_t v) {
+ return
+ MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+ MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+ }
+
+ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+ int32x4_t res;
+
+ res[0] = roundf(vgetq_lane_f32(v, 0));
+ res[1] = roundf(vgetq_lane_f32(v, 1));
+ res[2] = roundf(vgetq_lane_f32(v, 2));
+ res[3] = roundf(vgetq_lane_f32(v, 3));
+
+ return res;
+ }
+
+ inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+ uint8x8_t res;
+
+ res[0] = a[0]; res[1] = b[0];
+ res[2] = a[1]; res[3] = b[1];
+ res[4] = a[2]; res[5] = b[2];
+ res[6] = a[3]; res[7] = b[3];
+
+ return res;
+ }
+
+ inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+ uint8x8_t res;
+
+ res[0] = a[4]; res[1] = b[4];
+ res[2] = a[5]; res[3] = b[5];
+ res[4] = a[6]; res[5] = b[6];
+ res[6] = a[7]; res[7] = b[7];
+
+ return res;
+ }
+
+ // vld1q_s16_x2
+ // vld1q_u8_x2
+ // vld1q_u8_x4
+ // vld1q_s8_x2
+ // vld1q_s8_x4
+ // TODO: double-check these work correctly
+
+ typedef struct ggml_int16x8x2_t {
+ int16x8_t val[2];
+ } ggml_int16x8x2_t;
+
+ inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+ ggml_int16x8x2_t res;
+
+ res.val[0] = vld1q_s16(ptr + 0);
+ res.val[1] = vld1q_s16(ptr + 8);
+
+ return res;
+ }
+
+ typedef struct ggml_uint8x16x2_t {
+ uint8x16_t val[2];
+ } ggml_uint8x16x2_t;
+
+ inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+ ggml_uint8x16x2_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+
+ return res;
+ }
+
+ typedef struct ggml_uint8x16x4_t {
+ uint8x16_t val[4];
+ } ggml_uint8x16x4_t;
+
+ inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+ ggml_uint8x16x4_t res;
+
+ res.val[0] = vld1q_u8(ptr + 0);
+ res.val[1] = vld1q_u8(ptr + 16);
+ res.val[2] = vld1q_u8(ptr + 32);
+ res.val[3] = vld1q_u8(ptr + 48);
+
+ return res;
+ }
+
+ typedef struct ggml_int8x16x2_t {
+ int8x16_t val[2];
+ } ggml_int8x16x2_t;
+
+ inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+ ggml_int8x16x2_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+
+ return res;
+ }
+
+ typedef struct ggml_int8x16x4_t {
+ int8x16_t val[4];
+ } ggml_int8x16x4_t;
+
+ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+ ggml_int8x16x4_t res;
+
+ res.val[0] = vld1q_s8(ptr + 0);
+ res.val[1] = vld1q_s8(ptr + 16);
+ res.val[2] = vld1q_s8(ptr + 32);
+ res.val[3] = vld1q_s8(ptr + 48);
+
+ return res;
+ }
+
+ // NOTE: not tested
+ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+ int8x16_t res;
+
+ res[ 0] = a[b[ 0]];
+ res[ 1] = a[b[ 1]];
+ res[ 2] = a[b[ 2]];
+ res[ 3] = a[b[ 3]];
+ res[ 4] = a[b[ 4]];
+ res[ 5] = a[b[ 5]];
+ res[ 6] = a[b[ 6]];
+ res[ 7] = a[b[ 7]];
+ res[ 8] = a[b[ 8]];
+ res[ 9] = a[b[ 9]];
+ res[10] = a[b[10]];
+ res[11] = a[b[11]];
+ res[12] = a[b[12]];
+ res[13] = a[b[13]];
+ res[14] = a[b[14]];
+ res[15] = a[b[15]];
+
+ return res;
+ }
+
+ // NOTE: not tested
+ inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+ uint8x16_t res;
+
+ res[ 0] = a[b[ 0]];
+ res[ 1] = a[b[ 1]];
+ res[ 2] = a[b[ 2]];
+ res[ 3] = a[b[ 3]];
+ res[ 4] = a[b[ 4]];
+ res[ 5] = a[b[ 5]];
+ res[ 6] = a[b[ 6]];
+ res[ 7] = a[b[ 7]];
+ res[ 8] = a[b[ 8]];
+ res[ 9] = a[b[ 9]];
+ res[10] = a[b[10]];
+ res[11] = a[b[11]];
+ res[12] = a[b[12]];
+ res[13] = a[b[13]];
+ res[14] = a[b[14]];
+ res[15] = a[b[15]];
+
+ return res;
+ }
+
+ #else
+
+ #define ggml_int16x8x2_t int16x8x2_t
+ #define ggml_uint8x16x2_t uint8x16x2_t
+ #define ggml_uint8x16x4_t uint8x16x4_t
+ #define ggml_int8x16x2_t int8x16x2_t
+ #define ggml_int8x16x4_t int8x16x4_t
+
+ #define ggml_vld1q_s16_x2 vld1q_s16_x2
+ #define ggml_vld1q_u8_x2 vld1q_u8_x2
+ #define ggml_vld1q_u8_x4 vld1q_u8_x4
+ #define ggml_vld1q_s8_x2 vld1q_s8_x2
+ #define ggml_vld1q_s8_x4 vld1q_s8_x4
+ #define ggml_vqtbl1q_s8 vqtbl1q_s8
+ #define ggml_vqtbl1q_u8 vqtbl1q_u8
+
+ #endif // !defined(__aarch64__)
+
+ #if !defined(__ARM_FEATURE_DOTPROD)
+
+ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+ const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+ const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+ return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+ }
+
+ #else
+
+ #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+ #endif // !defined(__ARM_FEATURE_DOTPROD)
+
+ #endif // defined(__ARM_NEON)
+
+ #if defined(__ARM_NEON) && !defined(_MSC_VER)
+
  #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
  #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

@@ -75,8 +335,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {

  #else

- typedef uint16_t ggml_fp16_internal_t;
-
  #ifdef __wasm_simd128__
  #include <wasm_simd128.h>
  #else
@@ -221,7 +479,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {

  #endif // __F16C__

- #endif // __ARM_NEON
+ #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)

  // precomputed f32 table for f16 (256 KB)
  // defined in ggml.c, initialized in ggml_init()