@fugood/llama.node 0.3.12 → 0.3.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +2 -1
  18. package/package.json +1 -1
  19. package/src/LlamaCompletionWorker.cpp +14 -0
  20. package/src/LlamaContext.cpp +110 -79
  21. package/src/LlamaContext.h +1 -1
  22. package/src/common.hpp +1 -2
  23. package/src/llama.cpp/.github/workflows/build.yml +95 -13
  24. package/src/llama.cpp/.github/workflows/docker.yml +2 -0
  25. package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  27. package/src/llama.cpp/common/CMakeLists.txt +23 -6
  28. package/src/llama.cpp/common/arg.cpp +292 -14
  29. package/src/llama.cpp/common/chat.cpp +1128 -315
  30. package/src/llama.cpp/common/chat.h +135 -0
  31. package/src/llama.cpp/common/common.cpp +27 -171
  32. package/src/llama.cpp/common/common.h +41 -73
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  34. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  35. package/src/llama.cpp/common/llguidance.cpp +3 -3
  36. package/src/llama.cpp/common/log.cpp +1 -0
  37. package/src/llama.cpp/common/log.h +2 -1
  38. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
  39. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
  40. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  41. package/src/llama.cpp/common/sampling.cpp +93 -49
  42. package/src/llama.cpp/common/speculative.cpp +6 -5
  43. package/src/llama.cpp/common/speculative.h +1 -1
  44. package/src/llama.cpp/docs/build.md +47 -9
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  47. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  48. package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
  49. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
  50. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  52. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  53. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  54. package/src/llama.cpp/examples/llava/clip.h +19 -3
  55. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  56. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  57. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  58. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  59. package/src/llama.cpp/examples/main/main.cpp +73 -28
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  64. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  65. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  66. package/src/llama.cpp/examples/run/run.cpp +115 -79
  67. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/server/httplib.h +381 -292
  69. package/src/llama.cpp/examples/server/server.cpp +134 -128
  70. package/src/llama.cpp/examples/server/utils.hpp +95 -106
  71. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  72. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  73. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  74. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  75. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  76. package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
  77. package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
  78. package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
  79. package/src/llama.cpp/ggml/include/ggml.h +6 -2
  80. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  81. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  82. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  83. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  84. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  85. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  86. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  87. package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
  88. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  89. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  90. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  91. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  92. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
  93. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
  94. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
  96. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  97. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  98. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  99. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
  101. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
  102. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  103. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  104. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  105. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  106. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  107. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
  109. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  110. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  111. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  112. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  114. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  115. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  116. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  117. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  119. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  120. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  121. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
  124. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  125. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  126. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  128. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
  129. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
  130. package/src/llama.cpp/ggml/src/ggml.c +9 -4
  131. package/src/llama.cpp/include/llama.h +32 -14
  132. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  133. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  134. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  135. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  136. package/src/llama.cpp/requirements.txt +1 -0
  137. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  138. package/src/llama.cpp/src/llama-arch.h +1 -0
  139. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  140. package/src/llama.cpp/src/llama-grammar.cpp +183 -183
  141. package/src/llama.cpp/src/llama-grammar.h +13 -4
  142. package/src/llama.cpp/src/llama-impl.h +6 -6
  143. package/src/llama.cpp/src/llama-kv-cache.h +2 -1
  144. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  145. package/src/llama.cpp/src/llama-mmap.h +1 -0
  146. package/src/llama.cpp/src/llama-model.cpp +70 -6
  147. package/src/llama.cpp/src/llama-sampling.cpp +174 -67
  148. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  149. package/src/llama.cpp/src/llama.cpp +154 -5
  150. package/src/llama.cpp/src/unicode.cpp +9 -2
  151. package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
  152. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  153. package/src/llama.cpp/tests/test-chat.cpp +691 -325
  154. package/src/llama.cpp/tests/test-gguf.cpp +4 -4
  155. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  156. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  157. package/src/llama.cpp/tests/test-sampling.cpp +15 -0
  158. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  159. package/src/llama.cpp/common/chat.hpp +0 -52
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
@@ -7,10 +7,8 @@
  #include "ggml-cpu-impl.h"
  #include "ggml-cpu.h"
  #include "ggml-impl.h"
- #include "ggml-quants.h"
  #include "ggml-cpu-quants.h"
  #include "ggml-threading.h"
- #include "amx/amx.h"
  #include "ggml.h"

  #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -114,7 +112,8 @@ struct ggml_arm_arch_features_type {
  int has_i8mm;
  int has_sve;
  int sve_cnt;
- } ggml_arm_arch_features = {-1, -1, -1, -1, 0};
+ int has_sme;
+ } ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
  #endif


@@ -238,6 +237,8 @@ typedef pthread_t ggml_thread_t;
  #else
  #if defined(__POWER9_VECTOR__)
  #define CACHE_LINE_SIZE 128
+ #elif defined(__VXE__) || defined(__VXE2__)
+ #define CACHE_LINE_SIZE 256
  #else
  #define CACHE_LINE_SIZE 64
  #endif
@@ -246,9 +247,9 @@ typedef pthread_t ggml_thread_t;
  static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);


- static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
- static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
- static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+ static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);

  static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
  [GGML_TYPE_F32] = {
@@ -1078,29 +1079,23 @@ do { \
  #define GGML_F16_STEP 32
  #define GGML_F16_EPR 8

- // F16 arithmetic is not supported by AVX, so we use F32 instead
+ // F16 arithmetic is not supported by LASX, so we use F32 instead

  #define GGML_F32Cx8 __m256
  #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
  #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

  static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
- float tmp[8];
-
- for (int i = 0; i < 8; i++) {
- tmp[i] = GGML_FP16_TO_FP32(x[i]);
- }
-
- return (__m256)__lasx_xvld(tmp, 0);
+ __m256i a;
+ memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+ return __lasx_xvfcvtl_s_h(a);
  }
- static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
- float arr[8];
-
- __lasx_xvst(y, arr, 0);

- for (int i = 0; i < 8; i++) {
- x[i] = GGML_FP32_TO_FP16(arr[i]);
- }
+ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+ __m256i a = __lasx_xvfcvt_h_s(y, y);
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+ memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
  }
  #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
  #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1218,6 +1213,87 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
  #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
  #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

+ #elif defined(__VXE__) || defined(__VXE2__)
+
+ #define GGML_SIMD
+
+ // F32 s390x
+
+ #define GGML_F32_STEP 32
+ #define GGML_F32_EPR 4
+
+ #define GGML_F32x4 __vector float
+ #define GGML_F32x4_ZERO vec_splats(0.0f)
+ #define GGML_F32x4_SET1 vec_splats
+ #define GGML_F32x4_LOAD(p) vec_xl(0, p)
+ #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
+ #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+ #define GGML_F32x4_ADD vec_add
+ #define GGML_F32x4_MUL vec_mul
+ #define GGML_F32x4_REDUCE(res, x) \
+ { \
+ int offset = GGML_F32_ARR >> 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ offset >>= 1; \
+ for (int i = 0; i < offset; ++i) { \
+ x[i] = vec_add(x[i], x[offset + i]); \
+ } \
+ res = vec_extract(x[0], 0) + \
+ vec_extract(x[0], 1) + \
+ vec_extract(x[0], 2) + \
+ vec_extract(x[0], 3); \
+ }
+
+ #define GGML_F32_VEC GGML_F32x4
+ #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+ #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+ #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+ #define GGML_F32_VEC_STORE GGML_F32x4_STORE
+ #define GGML_F32_VEC_FMA GGML_F32x4_FMA
+ #define GGML_F32_VEC_ADD GGML_F32x4_ADD
+ #define GGML_F32_VEC_MUL GGML_F32x4_MUL
+ #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+ // F16 s390x
+ #define GGML_F16_STEP GGML_F32_STEP
+ #define GGML_F16_EPR GGML_F32_EPR
+
+ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
+ float tmp[4];
+
+ for (int i = 0; i < 4; i++) {
+ tmp[i] = GGML_FP16_TO_FP32(x[i]);
+ }
+
+ return vec_xl(0, tmp);
+ }
+
+ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
+ float arr[4];
+
+ vec_xst(y, 0, arr);
+
+ for (int i = 0; i < 4; i++) {
+ x[i] = GGML_FP32_TO_FP16(arr[i]);
+ }
+ }
+
+ #define GGML_F16_VEC GGML_F32x4
+ #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
+ #define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
+ #define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
+ #define GGML_F16_VEC_FMA GGML_F32x4_FMA
+ #define GGML_F16_VEC_ADD GGML_F32x4_ADD
+ #define GGML_F16_VEC_MUL GGML_F32x4_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+
  #endif

  // GGML_F32_ARR / GGML_F16_ARR
@@ -1297,7 +1373,7 @@ struct ggml_threadpool {
  atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
  atomic_int GGML_CACHE_ALIGN n_barrier;
  atomic_int GGML_CACHE_ALIGN n_barrier_passed;
- atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+ atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

  // these are atomic as an annotation for thread-sanitizer
  atomic_bool stop; // Used for stopping the threadpool altogether
@@ -1339,17 +1415,43 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
  inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
  inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
  inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+ inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+ for (int i = 0; i < n; ++i) {
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+ }
+ }
  inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
  inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
  inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
  inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+ for (int i = 0; i < n; ++i) {
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+ }
+ }
  inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
  inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
  inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+ inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+ }
+ }
+
  inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+ inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+ for (int i = 0; i < n; ++i) {
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+ }
+ }
  inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
+ inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+ for (int i = 0; i < n; ++i) {
+ z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+ }
+ }

- static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
@@ -1392,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
  *s = sumf;
  }

- static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+ static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
@@ -1460,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
  *s = sumf;
  }

- static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+ static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
  assert(nrc == 1);
  UNUSED(nrc);
  UNUSED(bx);
@@ -1504,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *

  // compute GGML_VEC_DOT_UNROLL dot products at once
  // xs - x row stride in bytes
- inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
  ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

- ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+ ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

  for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
  x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1557,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
  }
  }

- inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
  #if defined(GGML_SIMD)
  const int np = (n & ~(GGML_F32_STEP - 1));

@@ -1588,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
  #endif
  }

- inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
  #if defined(GGML_SIMD)
  const int np = (n & ~(GGML_F16_STEP - 1));

@@ -1620,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
  }

  // xs and vs are byte strides of x and v
- inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

- const float * restrict x[GGML_VEC_MAD_UNROLL];
- const float * restrict v[GGML_VEC_MAD_UNROLL];
+ const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+ const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

  for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
  x[i] = (const float *) ((const char *) xv + i*xs);
@@ -1734,22 +1836,107 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float

  inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
  inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
+ inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16(v*v);
+ }
+ }
  inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+ inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
+ inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
+ inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
+ inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+ inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+ inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+ }
+ }
  inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+ inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+ }
+ }
  inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+ inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+ inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+ }
+ }
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+ inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+ }
+ }
  inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+ inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+ }
+ }
  inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+ inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+ }
+ }
  // TODO: optimize performance
  inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+ inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+ }
+ }
  inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+ inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+ }
+ }
  inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+ }
+ }

  static const float GELU_COEF_A = 0.044715f;
  static const float GELU_QUICK_COEF = -1.702f;
@@ -1817,14 +2004,25 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
  }
  #endif

+ inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ float v = GGML_FP16_TO_FP32(x[i]);
+ y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+ }
+ }
+
  // Sigmoid Linear Unit (SiLU) function
  inline static float ggml_silu_f32(float x) {
  return x/(1.0f + expf(-x));
  }
+ inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
+ float v = GGML_FP16_TO_FP32(x);
+ return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+ }

  #if __FINITE_MATH_ONLY__
  #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
- #error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
  #endif

  #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -2044,6 +2242,12 @@ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
  }
  }

+ inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+ for (int i = 0; i < n; ++i) {
+ y[i] = ggml_silu_f16(x[i]);
+ }
+ }
+
  static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
  int i = 0;
  ggml_float sum = 0;
@@ -2115,12 +2319,24 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
  return dy*s*(1.0f + x*(1.0f - s));
  }

+ inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
+ const float v = GGML_FP16_TO_FP32(x);
+ const float s = 1.0f/(1.0f + expf(-v));
+ return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+ }
+
  inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
  for (int i = 0; i < n; ++i) {
  dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
  }
  }

+ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
+ for (int i = 0; i < n; ++i) {
+ dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
+ }
+ }
+
  inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
  #ifndef GGML_USE_ACCELERATE
  ggml_float sum = 0.0;
@@ -2389,15 +2605,20 @@ bool ggml_is_numa(void) {
  #define HWCAP2_I8MM (1 << 13)
  #endif

+ #if !defined(HWCAP2_SME)
+ #define HWCAP2_SME (1 << 23)
+ #endif
+
  static void ggml_init_arm_arch_features(void) {
  #if defined(__linux__) && defined(__aarch64__)
  uint32_t hwcap = getauxval(AT_HWCAP);
  uint32_t hwcap2 = getauxval(AT_HWCAP2);

- ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+ ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
  ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
- ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
- ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
+ ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+ ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
+ ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);

  #if defined(__ARM_FEATURE_SVE)
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
@@ -2420,6 +2641,11 @@ static void ggml_init_arm_arch_features(void) {
  }
  ggml_arm_arch_features.has_i8mm = oldp;

+ if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
+ oldp = 0;
+ }
+ ggml_arm_arch_features.has_sme = oldp;
+
  ggml_arm_arch_features.has_sve = 0;
  ggml_arm_arch_features.sve_cnt = 0;
  #else
@@ -2443,6 +2669,12 @@ static void ggml_init_arm_arch_features(void) {
  ggml_arm_arch_features.has_sve = 0;
  ggml_arm_arch_features.sve_cnt = 0;
  #endif
+
+ #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
+ ggml_arm_arch_features.has_sme = 1;
+ #else
+ ggml_arm_arch_features.has_sme = 0;
+ #endif
  #endif
  }
  #endif
@@ -4287,7 +4519,7 @@ static void ggml_compute_forward_add_f16_f16(
  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];

- GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

  const int ith = params->ith;
  const int nth = params->nth;
@@ -4312,17 +4544,22 @@

  if (nb10 == sizeof(ggml_fp16_t)) {
  for (int ir = ir0; ir < ir1; ++ir) {
- // src0, src1 and dst are same shape => same indices
- const int i3 = ir/(ne2*ne1);
- const int i2 = (ir - i3*ne2*ne1)/ne1;
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
+ const int64_t i03 = ir/(ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

- ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
- ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
- ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+ const int64_t nr0 = ne00 / ne10;

- for (int i = 0; i < ne0; i++) {
- dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+ for (int64_t r = 0; r < nr0; ++r) {
+ ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
  }
  }
  }
@@ -5110,6 +5347,62 @@ static void ggml_compute_forward_sub_f32(
  }
  }

+ static void ggml_compute_forward_sub_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(src0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ // rows per thread
+ const int dr = (nr + nth - 1)/nth;
+
+ // row range for this thread
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ if (nb10 == sizeof(ggml_fp16_t)) {
+ for (int ir = ir0; ir < ir1; ++ir) {
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
+ const int64_t i03 = ir/(ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+ const int64_t nr0 = ne00 / ne10;
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+ for (int64_t r = 0; r < nr0; ++r) {
+ ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ }
+ }
+ } else {
+ // src1 is not contiguous
+ GGML_ABORT("unimplemented error");
+ }
+ }
+
  static void ggml_compute_forward_sub(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5121,6 +5414,10 @@ static void ggml_compute_forward_sub(
  {
  ggml_compute_forward_sub_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_sub_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5201,32 +5498,9 @@ static void ggml_compute_forward_mul_f32(
  }
  }

- static void ggml_compute_forward_mul(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
-
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
-
- switch (src0->type) {
- case GGML_TYPE_F32:
- {
- ggml_compute_forward_mul_f32(params, dst);
- } break;
- default:
- {
- GGML_ABORT("fatal error");
- }
- }
- }
-
- // ggml_compute_forward_div
-
- static void ggml_compute_forward_div_f32(
- const struct ggml_compute_params * params,
- struct ggml_tensor * dst) {
+ static void ggml_compute_forward_mul_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {

  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];
@@ -5240,10 +5514,14 @@

  GGML_TENSOR_BINARY_OP_LOCALS

- GGML_ASSERT( nb0 == sizeof(float));
- GGML_ASSERT(nb00 == sizeof(float));
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);

- if (nb10 == sizeof(float)) {
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ if (nb10 == sizeof(ggml_fp16_t)) {
  for (int64_t ir = ith; ir < nr; ir += nth) {
  // src0 and dst are same shape => same indices
  const int64_t i03 = ir/(ne02*ne01);
@@ -5255,13 +5533,85 @@
  const int64_t i11 = i01 % ne11;
  const int64_t nr0 = ne00 / ne10;

- float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
- float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
- float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
-
- for (int64_t r = 0; r < nr0; ++r) {
- #ifdef GGML_USE_ACCELERATE
- UNUSED(ggml_vec_div_f32);
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+ for (int64_t r = 0 ; r < nr0; ++r) {
+ ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ }
+ }
+ } else {
+ // src1 is not contiguous
+ GGML_ABORT("unimplemented error");
+ }
+ }
+
+ static void ggml_compute_forward_mul(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT((src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && "only f32/f16 src1 supported for now");
+
+ switch (src0->type) {
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_mul_f32(params, dst);
+ } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_mul_f16(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+ }
+
+ // ggml_compute_forward_div
+
+ static void ggml_compute_forward_div_f32(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t nr = ggml_nrows(src0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT( nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ if (nb10 == sizeof(float)) {
+ for (int64_t ir = ith; ir < nr; ir += nth) {
+ // src0 and dst are same shape => same indices
+ const int64_t i03 = ir/(ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+ const int64_t nr0 = ne00 / ne10;
+
+ float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+ float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+ for (int64_t r = 0; r < nr0; ++r) {
+ #ifdef GGML_USE_ACCELERATE
+ UNUSED(ggml_vec_div_f32);

  vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
  #else
@@ -5295,6 +5645,55 @@ static void ggml_compute_forward_div_f32(
  }
  }

+ static void ggml_compute_forward_div_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t nr = ggml_nrows(src0);
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_F16);
+ GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+ if (nb10 == sizeof(ggml_fp16_t)) {
+ for (int64_t ir = ith; ir < nr; ir += nth) {
+ // src0 and dst are same shape => same indices
+ const int64_t i03 = ir/(ne02*ne01);
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+ const int64_t i13 = i03 % ne13;
+ const int64_t i12 = i02 % ne12;
+ const int64_t i11 = i01 % ne11;
+ const int64_t nr0 = ne00 / ne10;
+
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+ ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+ for (int64_t r = 0; r < nr0; ++r) {
+ ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+ }
+ }
+ } else {
+ // src1 is not contiguous
+ GGML_ABORT("unimplemented error");
+ }
+ }
+
  static void ggml_compute_forward_div(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5306,6 +5705,10 @@ static void ggml_compute_forward_div(
  {
  ggml_compute_forward_div_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_div_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5340,6 +5743,31 @@
  }
  }

+ static void ggml_compute_forward_sqr_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ if (params->ith != 0) {
+ return;
+ }
+
+ assert(ggml_are_same_shape(src0, dst));
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ assert( dst->nb[0] == sizeof(ggml_fp16_t));
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_sqr_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
  static void ggml_compute_forward_sqr(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5351,6 +5779,10 @@ static void ggml_compute_forward_sqr(
  {
  ggml_compute_forward_sqr_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_sqr_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5385,6 +5817,31 @@
  }
  }

+ static void ggml_compute_forward_sqrt_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ if (params->ith != 0) {
+ return;
+ }
+
+ assert(ggml_are_same_shape(src0, dst));
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ assert( dst->nb[0] == sizeof(ggml_fp16_t));
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_sqrt_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
  static void ggml_compute_forward_sqrt(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5396,6 +5853,10 @@ static void ggml_compute_forward_sqrt(
  {
  ggml_compute_forward_sqrt_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_sqrt_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5430,6 +5891,31 @@
  }
  }

+ static void ggml_compute_forward_log_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ if (params->ith != 0) {
+ return;
+ }
+
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_log_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
  static void ggml_compute_forward_log(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5441,6 +5927,10 @@ static void ggml_compute_forward_log(
  {
  ggml_compute_forward_log_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_log_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5475,6 +5965,31 @@
  }
  }

+ static void ggml_compute_forward_sin_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ if (params->ith != 0) {
+ return;
+ }
+
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_sin_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
  static void ggml_compute_forward_sin(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5486,6 +6001,10 @@ static void ggml_compute_forward_sin(
  {
  ggml_compute_forward_sin_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_sin_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -5520,6 +6039,31 @@
  }
  }

+ static void ggml_compute_forward_cos_f16(
+ const struct ggml_compute_params * params,
+ struct ggml_tensor * dst) {
+
+ const struct ggml_tensor * src0 = dst->src[0];
+
+ if (params->ith != 0) {
+ return;
+ }
+
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ const int n = ggml_nrows(src0);
+ const int nc = src0->ne[0];
+
+ GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+ GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+ for (int i = 0; i < n; i++) {
+ ggml_vec_cos_f16(nc,
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+ }
+ }
+
  static void ggml_compute_forward_cos(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {
@@ -5531,6 +6075,10 @@ static void ggml_compute_forward_cos(
  {
  ggml_compute_forward_cos_f32(params, dst);
  } break;
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_cos_f16(params, dst);
+ } break;
  default:
  {
  GGML_ABORT("fatal error");
@@ -6100,14 +6648,14 @@ static void ggml_compute_forward_repeat_back(

  // ggml_compute_forward_concat

- static void ggml_compute_forward_concat_f32(
+ static void ggml_compute_forward_concat_any(
  const struct ggml_compute_params * params,
  struct ggml_tensor * dst) {

  const struct ggml_tensor * src0 = dst->src[0];
  const struct ggml_tensor * src1 = dst->src[1];

- GGML_ASSERT(src0->nb[0] == sizeof(float));
+ const size_t len = ggml_type_size(src0->type);

  const int ith = params->ith;
  const int nth = params->nth;
@@ -6121,7 +6669,7 @@
  int64_t o[4] = {0, 0, 0, 0};
  o[dim] = src0->ne[dim];

- const float * x;
+ const char * x;

  // TODO: smarter multi-theading
  for (int i3 = 0; i3 < ne3; i3++) {
@@ -6129,40 +6677,179 @@ static void ggml_compute_forward_concat_f32(
6129
6677
  for (int i1 = 0; i1 < ne1; i1++) {
6130
6678
  for (int i0 = 0; i0 < ne0; i0++) {
6131
6679
  if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
6132
- x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
6680
+ x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
6133
6681
  } else {
6134
- x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
6682
+ x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
6135
6683
  }
6136
6684
 
6137
- float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
6685
+ char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
6138
6686
 
6139
- *y = *x;
6687
+ memcpy(y, x, len);
6140
6688
  }
6141
6689
  }
6142
6690
  }
6143
6691
  }
6144
6692
  }
6145
6693
 
6146
- static void ggml_compute_forward_concat(
6694
+ static void ggml_compute_forward_concat_i8(
6147
6695
  const struct ggml_compute_params * params,
6148
6696
  struct ggml_tensor * dst) {
6149
6697
 
6150
6698
  const struct ggml_tensor * src0 = dst->src[0];
6699
+ const struct ggml_tensor * src1 = dst->src[1];
6151
6700
 
6152
- switch (src0->type) {
6153
- case GGML_TYPE_F32:
6154
- case GGML_TYPE_I32:
6155
- {
6156
- ggml_compute_forward_concat_f32(params, dst);
6157
- } break;
6158
- default:
6159
- {
6160
- GGML_ABORT("fatal error");
6161
- }
6162
- }
6163
- }
6164
-
6165
- // ggml_compute_forward_abs
6701
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
6702
+
6703
+ const int ith = params->ith;
6704
+ const int nth = params->nth;
6705
+
6706
+ GGML_TENSOR_BINARY_OP_LOCALS
6707
+
6708
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
6709
+
6710
+ GGML_ASSERT(dim >= 0 && dim < 4);
6711
+
6712
+ int64_t o[4] = {0, 0, 0, 0};
6713
+ o[dim] = src0->ne[dim];
6714
+
6715
+ const int8_t * x;
6716
+
6717
+ // TODO: smarter multi-threading
6718
+ for (int i3 = 0; i3 < ne3; i3++) {
6719
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
6720
+ for (int i1 = 0; i1 < ne1; i1++) {
6721
+ for (int i0 = 0; i0 < ne0; i0++) {
6722
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
6723
+ x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
6724
+ } else {
6725
+ x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
6726
+ }
6727
+
6728
+ int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
6729
+
6730
+ *y = *x;
6731
+ }
6732
+ }
6733
+ }
6734
+ }
6735
+ }
6736
+
6737
+ static void ggml_compute_forward_concat_f16(
6738
+ const struct ggml_compute_params * params,
6739
+ struct ggml_tensor * dst) {
6740
+
6741
+ const struct ggml_tensor * src0 = dst->src[0];
6742
+ const struct ggml_tensor * src1 = dst->src[1];
6743
+
6744
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
6745
+
6746
+ const int ith = params->ith;
6747
+ const int nth = params->nth;
6748
+
6749
+ GGML_TENSOR_BINARY_OP_LOCALS
6750
+
6751
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
6752
+
6753
+ GGML_ASSERT(dim >= 0 && dim < 4);
6754
+
6755
+ int64_t o[4] = {0, 0, 0, 0};
6756
+ o[dim] = src0->ne[dim];
6757
+
6758
+ const ggml_fp16_t * x;
6759
+
6760
+ // TODO: smarter multi-threading
6761
+ for (int i3 = 0; i3 < ne3; i3++) {
6762
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
6763
+ for (int i1 = 0; i1 < ne1; i1++) {
6764
+ for (int i0 = 0; i0 < ne0; i0++) {
6765
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
6766
+ x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
6767
+ } else {
6768
+ x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
6769
+ }
6770
+
6771
+ ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
6772
+
6773
+ *y = *x;
6774
+ }
6775
+ }
6776
+ }
6777
+ }
6778
+ }
6779
+
6780
+ static void ggml_compute_forward_concat_f32(
6781
+ const struct ggml_compute_params * params,
6782
+ struct ggml_tensor * dst) {
6783
+
6784
+ const struct ggml_tensor * src0 = dst->src[0];
6785
+ const struct ggml_tensor * src1 = dst->src[1];
6786
+
6787
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
6788
+
6789
+ const int ith = params->ith;
6790
+ const int nth = params->nth;
6791
+
6792
+ GGML_TENSOR_BINARY_OP_LOCALS
6793
+
6794
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
6795
+
6796
+ GGML_ASSERT(dim >= 0 && dim < 4);
6797
+
6798
+ int64_t o[4] = {0, 0, 0, 0};
6799
+ o[dim] = src0->ne[dim];
6800
+
6801
+ const float * x;
6802
+
6803
+ // TODO: smarter multi-threading
6804
+ for (int i3 = 0; i3 < ne3; i3++) {
6805
+ for (int i2 = ith; i2 < ne2; i2 += nth) {
6806
+ for (int i1 = 0; i1 < ne1; i1++) {
6807
+ for (int i0 = 0; i0 < ne0; i0++) {
6808
+ if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
6809
+ x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
6810
+ } else {
6811
+ x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
6812
+ }
6813
+
6814
+ float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
6815
+
6816
+ *y = *x;
6817
+ }
6818
+ }
6819
+ }
6820
+ }
6821
+ }
6822
+
6823
+ static void ggml_compute_forward_concat(
6824
+ const struct ggml_compute_params * params,
6825
+ struct ggml_tensor * dst) {
6826
+
6827
+ const struct ggml_tensor * src0 = dst->src[0];
6828
+
6829
+ switch (src0->type) {
6830
+ case GGML_TYPE_F16:
6831
+ case GGML_TYPE_BF16:
6832
+ case GGML_TYPE_I16:
6833
+ {
6834
+ ggml_compute_forward_concat_f16(params, dst);
6835
+ } break;
6836
+ case GGML_TYPE_I8:
6837
+ {
6838
+ ggml_compute_forward_concat_i8(params, dst);
6839
+ } break;
6840
+ case GGML_TYPE_F32:
6841
+ case GGML_TYPE_I32:
6842
+ {
6843
+ ggml_compute_forward_concat_f32(params, dst);
6844
+ } break;
6845
+ default:
6846
+ {
6847
+ ggml_compute_forward_concat_any(params, dst);
6848
+ }
6849
+ }
6850
+ }
6851
+
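The rewritten concat dispatch above groups types by element width: F16, BF16 and I16 share the 2-byte kernel, F32 and I32 the 4-byte one, I8 the 1-byte one, and every other type falls through to ggml_compute_forward_concat_any, which simply memcpy's ggml_type_size(src0->type) bytes per element. A standalone sketch of that byte-wise idea for 1-D buffers (all names illustrative):

```c
// Sketch: concatenate two 1-D buffers of arbitrary element size along dim 0.
// Mirrors the memcpy(y, x, len) fallback; real ggml handles 4-D tensors and
// non-contiguous strides.
#include <string.h>
#include <stdint.h>

static void concat_any_1d(void * dst, const void * a, int64_t na,
                          const void * b, int64_t nb, size_t elem_size) {
    for (int64_t i = 0; i < na + nb; ++i) {
        const char * src = (i < na)
            ? (const char *) a + i        * elem_size
            : (const char *) b + (i - na) * elem_size;
        memcpy((char *) dst + i * elem_size, src, elem_size);
    }
}
```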
6852
+ // ggml_compute_forward_abs
6166
6853
 
6167
6854
  static void ggml_compute_forward_abs_f32(
6168
6855
  const struct ggml_compute_params * params,
@@ -6188,6 +6875,30 @@ static void ggml_compute_forward_abs_f32(
6188
6875
  }
6189
6876
  }
6190
6877
 
6878
+ static void ggml_compute_forward_abs_f16(
6879
+ const struct ggml_compute_params * params,
6880
+ struct ggml_tensor * dst) {
6881
+
6882
+ const struct ggml_tensor * src0 = dst->src[0];
6883
+
6884
+ if (params->ith != 0) {
6885
+ return;
6886
+ }
6887
+
6888
+ assert(ggml_is_contiguous_1(src0));
6889
+ assert(ggml_is_contiguous_1(dst));
6890
+ assert(ggml_are_same_shape(src0, dst));
6891
+
6892
+ const int n = ggml_nrows(src0);
6893
+ const int nc = src0->ne[0];
6894
+
6895
+ for (int i = 0; i < n; i++) {
6896
+ ggml_vec_abs_f16(nc,
6897
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6898
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6899
+ }
6900
+ }
6901
+
6191
6902
  static void ggml_compute_forward_abs(
6192
6903
  const struct ggml_compute_params * params,
6193
6904
  struct ggml_tensor * dst) {
@@ -6199,6 +6910,10 @@ static void ggml_compute_forward_abs(
6199
6910
  {
6200
6911
  ggml_compute_forward_abs_f32(params, dst);
6201
6912
  } break;
6913
+ case GGML_TYPE_F16:
6914
+ {
6915
+ ggml_compute_forward_abs_f16(params, dst);
6916
+ } break;
6202
6917
  default:
6203
6918
  {
6204
6919
  GGML_ABORT("fatal error");
@@ -6232,6 +6947,30 @@ static void ggml_compute_forward_sgn_f32(
6232
6947
  }
6233
6948
  }
6234
6949
 
6950
+ static void ggml_compute_forward_sgn_f16(
6951
+ const struct ggml_compute_params * params,
6952
+ struct ggml_tensor * dst) {
6953
+
6954
+ const struct ggml_tensor * src0 = dst->src[0];
6955
+
6956
+ if (params->ith != 0) {
6957
+ return;
6958
+ }
6959
+
6960
+ assert(ggml_is_contiguous_1(src0));
6961
+ assert(ggml_is_contiguous_1(dst));
6962
+ assert(ggml_are_same_shape(src0, dst));
6963
+
6964
+ const int n = ggml_nrows(src0);
6965
+ const int nc = src0->ne[0];
6966
+
6967
+ for (int i = 0; i < n; i++) {
6968
+ ggml_vec_sgn_f16(nc,
6969
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
6970
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
6971
+ }
6972
+ }
6973
+
6235
6974
  static void ggml_compute_forward_sgn(
6236
6975
  const struct ggml_compute_params * params,
6237
6976
  struct ggml_tensor * dst) {
@@ -6243,6 +6982,10 @@ static void ggml_compute_forward_sgn(
6243
6982
  {
6244
6983
  ggml_compute_forward_sgn_f32(params, dst);
6245
6984
  } break;
6985
+ case GGML_TYPE_F16:
6986
+ {
6987
+ ggml_compute_forward_sgn_f16(params, dst);
6988
+ } break;
6246
6989
  default:
6247
6990
  {
6248
6991
  GGML_ABORT("fatal error");
@@ -6276,6 +7019,30 @@ static void ggml_compute_forward_neg_f32(
6276
7019
  }
6277
7020
  }
6278
7021
 
7022
+ static void ggml_compute_forward_neg_f16(
7023
+ const struct ggml_compute_params * params,
7024
+ struct ggml_tensor * dst) {
7025
+
7026
+ const struct ggml_tensor * src0 = dst->src[0];
7027
+
7028
+ if (params->ith != 0) {
7029
+ return;
7030
+ }
7031
+
7032
+ assert(ggml_is_contiguous_1(src0));
7033
+ assert(ggml_is_contiguous_1(dst));
7034
+ assert(ggml_are_same_shape(src0, dst));
7035
+
7036
+ const int n = ggml_nrows(src0);
7037
+ const int nc = src0->ne[0];
7038
+
7039
+ for (int i = 0; i < n; i++) {
7040
+ ggml_vec_neg_f16(nc,
7041
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7042
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7043
+ }
7044
+ }
7045
+
6279
7046
  static void ggml_compute_forward_neg(
6280
7047
  const struct ggml_compute_params * params,
6281
7048
  struct ggml_tensor * dst) {
@@ -6287,6 +7054,10 @@ static void ggml_compute_forward_neg(
6287
7054
  {
6288
7055
  ggml_compute_forward_neg_f32(params, dst);
6289
7056
  } break;
7057
+ case GGML_TYPE_F16:
7058
+ {
7059
+ ggml_compute_forward_neg_f16(params, dst);
7060
+ } break;
6290
7061
  default:
6291
7062
  {
6292
7063
  GGML_ABORT("fatal error");
@@ -6320,6 +7091,30 @@ static void ggml_compute_forward_step_f32(
6320
7091
  }
6321
7092
  }
6322
7093
 
7094
+ static void ggml_compute_forward_step_f16(
7095
+ const struct ggml_compute_params * params,
7096
+ struct ggml_tensor * dst) {
7097
+
7098
+ const struct ggml_tensor * src0 = dst->src[0];
7099
+
7100
+ if (params->ith != 0) {
7101
+ return;
7102
+ }
7103
+
7104
+ assert(ggml_is_contiguous_1(src0));
7105
+ assert(ggml_is_contiguous_1(dst));
7106
+ assert(ggml_are_same_shape(src0, dst));
7107
+
7108
+ const int n = ggml_nrows(src0);
7109
+ const int nc = src0->ne[0];
7110
+
7111
+ for (int i = 0; i < n; i++) {
7112
+ ggml_vec_step_f16(nc,
7113
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7114
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7115
+ }
7116
+ }
7117
+
6323
7118
  static void ggml_compute_forward_step(
6324
7119
  const struct ggml_compute_params * params,
6325
7120
  struct ggml_tensor * dst) {
@@ -6331,6 +7126,10 @@ static void ggml_compute_forward_step(
6331
7126
  {
6332
7127
  ggml_compute_forward_step_f32(params, dst);
6333
7128
  } break;
7129
+ case GGML_TYPE_F16:
7130
+ {
7131
+ ggml_compute_forward_step_f16(params, dst);
7132
+ } break;
6334
7133
  default:
6335
7134
  {
6336
7135
  GGML_ABORT("fatal error");
@@ -6364,6 +7163,30 @@ static void ggml_compute_forward_tanh_f32(
6364
7163
  }
6365
7164
  }
6366
7165
 
7166
+ static void ggml_compute_forward_tanh_f16(
7167
+ const struct ggml_compute_params * params,
7168
+ struct ggml_tensor * dst) {
7169
+
7170
+ const struct ggml_tensor * src0 = dst->src[0];
7171
+
7172
+ if (params->ith != 0) {
7173
+ return;
7174
+ }
7175
+
7176
+ assert(ggml_is_contiguous_1(src0));
7177
+ assert(ggml_is_contiguous_1(dst));
7178
+ assert(ggml_are_same_shape(src0, dst));
7179
+
7180
+ const int n = ggml_nrows(src0);
7181
+ const int nc = src0->ne[0];
7182
+
7183
+ for (int i = 0; i < n; i++) {
7184
+ ggml_vec_tanh_f16(nc,
7185
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7186
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7187
+ }
7188
+ }
7189
+
6367
7190
  static void ggml_compute_forward_tanh(
6368
7191
  const struct ggml_compute_params * params,
6369
7192
  struct ggml_tensor * dst) {
@@ -6375,6 +7198,10 @@ static void ggml_compute_forward_tanh(
6375
7198
  {
6376
7199
  ggml_compute_forward_tanh_f32(params, dst);
6377
7200
  } break;
7201
+ case GGML_TYPE_F16:
7202
+ {
7203
+ ggml_compute_forward_tanh_f16(params, dst);
7204
+ } break;
6378
7205
  default:
6379
7206
  {
6380
7207
  GGML_ABORT("fatal error");
@@ -6408,6 +7235,30 @@ static void ggml_compute_forward_elu_f32(
6408
7235
  }
6409
7236
  }
6410
7237
 
7238
+ static void ggml_compute_forward_elu_f16(
7239
+ const struct ggml_compute_params * params,
7240
+ struct ggml_tensor * dst) {
7241
+
7242
+ const struct ggml_tensor * src0 = dst->src[0];
7243
+
7244
+ if (params->ith != 0) {
7245
+ return;
7246
+ }
7247
+
7248
+ assert(ggml_is_contiguous_1(src0));
7249
+ assert(ggml_is_contiguous_1(dst));
7250
+ assert(ggml_are_same_shape(src0, dst));
7251
+
7252
+ const int n = ggml_nrows(src0);
7253
+ const int nc = src0->ne[0];
7254
+
7255
+ for (int i = 0; i < n; i++) {
7256
+ ggml_vec_elu_f16(nc,
7257
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7258
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7259
+ }
7260
+ }
7261
+
6411
7262
  static void ggml_compute_forward_elu(
6412
7263
  const struct ggml_compute_params * params,
6413
7264
  struct ggml_tensor * dst) {
@@ -6419,6 +7270,10 @@ static void ggml_compute_forward_elu(
6419
7270
  {
6420
7271
  ggml_compute_forward_elu_f32(params, dst);
6421
7272
  } break;
7273
+ case GGML_TYPE_F16:
7274
+ {
7275
+ ggml_compute_forward_elu_f16(params, dst);
7276
+ } break;
6422
7277
  default:
6423
7278
  {
6424
7279
  GGML_ABORT("fatal error");
@@ -6452,6 +7307,30 @@ static void ggml_compute_forward_relu_f32(
6452
7307
  }
6453
7308
  }
6454
7309
 
7310
+ static void ggml_compute_forward_relu_f16(
7311
+ const struct ggml_compute_params * params,
7312
+ struct ggml_tensor * dst) {
7313
+
7314
+ const struct ggml_tensor * src0 = dst->src[0];
7315
+
7316
+ if (params->ith != 0) {
7317
+ return;
7318
+ }
7319
+
7320
+ assert(ggml_is_contiguous_1(src0));
7321
+ assert(ggml_is_contiguous_1(dst));
7322
+ assert(ggml_are_same_shape(src0, dst));
7323
+
7324
+ const int n = ggml_nrows(src0);
7325
+ const int nc = src0->ne[0];
7326
+
7327
+ for (int i = 0; i < n; i++) {
7328
+ ggml_vec_relu_f16(nc,
7329
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7330
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7331
+ }
7332
+ }
7333
+
6455
7334
  static void ggml_compute_forward_relu(
6456
7335
  const struct ggml_compute_params * params,
6457
7336
  struct ggml_tensor * dst) {
@@ -6463,6 +7342,10 @@ static void ggml_compute_forward_relu(
6463
7342
  {
6464
7343
  ggml_compute_forward_relu_f32(params, dst);
6465
7344
  } break;
7345
+ case GGML_TYPE_F16:
7346
+ {
7347
+ ggml_compute_forward_relu_f16(params, dst);
7348
+ } break;
6466
7349
  default:
6467
7350
  {
6468
7351
  GGML_ABORT("fatal error");
@@ -6496,6 +7379,30 @@ static void ggml_compute_forward_sigmoid_f32(
6496
7379
  }
6497
7380
  }
6498
7381
 
7382
+ static void ggml_compute_forward_sigmoid_f16(
7383
+ const struct ggml_compute_params * params,
7384
+ struct ggml_tensor * dst) {
7385
+
7386
+ const struct ggml_tensor * src0 = dst->src[0];
7387
+
7388
+ if (params->ith != 0) {
7389
+ return;
7390
+ }
7391
+
7392
+ assert(ggml_is_contiguous_1(src0));
7393
+ assert(ggml_is_contiguous_1(dst));
7394
+ assert(ggml_are_same_shape(src0, dst));
7395
+
7396
+ const int n = ggml_nrows(src0);
7397
+ const int nc = src0->ne[0];
7398
+
7399
+ for (int i = 0; i < n; i++) {
7400
+ ggml_vec_sigmoid_f16(nc,
7401
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7402
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7403
+ }
7404
+ }
7405
+
6499
7406
  static void ggml_compute_forward_sigmoid(
6500
7407
  const struct ggml_compute_params * params,
6501
7408
  struct ggml_tensor * dst) {
@@ -6507,6 +7414,10 @@ static void ggml_compute_forward_sigmoid(
6507
7414
  {
6508
7415
  ggml_compute_forward_sigmoid_f32(params, dst);
6509
7416
  } break;
7417
+ case GGML_TYPE_F16:
7418
+ {
7419
+ ggml_compute_forward_sigmoid_f16(params, dst);
7420
+ } break;
6510
7421
  default:
6511
7422
  {
6512
7423
  GGML_ABORT("fatal error");
@@ -6555,6 +7466,46 @@ static void ggml_compute_forward_gelu_f32(
6555
7466
  }
6556
7467
  }
6557
7468
 
7469
+ static void ggml_compute_forward_gelu_f16(
7470
+ const struct ggml_compute_params * params,
7471
+ struct ggml_tensor * dst) {
7472
+
7473
+ const struct ggml_tensor * src0 = dst->src[0];
7474
+
7475
+ assert(ggml_is_contiguous_1(src0));
7476
+ assert(ggml_is_contiguous_1(dst));
7477
+ assert(ggml_are_same_shape(src0, dst));
7478
+
7479
+ const int ith = params->ith;
7480
+ const int nth = params->nth;
7481
+
7482
+ const int nc = src0->ne[0];
7483
+ const int nr = ggml_nrows(src0);
7484
+
7485
+ // rows per thread
7486
+ const int dr = (nr + nth - 1)/nth;
7487
+
7488
+ // row range for this thread
7489
+ const int ir0 = dr*ith;
7490
+ const int ir1 = MIN(ir0 + dr, nr);
7491
+
7492
+ for (int i1 = ir0; i1 < ir1; i1++) {
7493
+ ggml_vec_gelu_f16(nc,
7494
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7495
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7496
+
7497
+ #ifndef NDEBUG
7498
+ for (int k = 0; k < nc; k++) {
7499
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7500
+ const float v = GGML_FP16_TO_FP32(x);
7501
+ UNUSED(v);
7502
+ assert(!isnan(v));
7503
+ assert(!isinf(v));
7504
+ }
7505
+ #endif
7506
+ }
7507
+ }
7508
+
6558
7509
  static void ggml_compute_forward_gelu(
6559
7510
  const struct ggml_compute_params * params,
6560
7511
  struct ggml_tensor * dst) {
@@ -6566,6 +7517,10 @@ static void ggml_compute_forward_gelu(
6566
7517
  {
6567
7518
  ggml_compute_forward_gelu_f32(params, dst);
6568
7519
  } break;
7520
+ case GGML_TYPE_F16:
7521
+ {
7522
+ ggml_compute_forward_gelu_f16(params, dst);
7523
+ } break;
6569
7524
  default:
6570
7525
  {
6571
7526
  GGML_ABORT("fatal error");
@@ -6614,6 +7569,46 @@ static void ggml_compute_forward_gelu_quick_f32(
6614
7569
  }
6615
7570
  }
6616
7571
 
7572
+ static void ggml_compute_forward_gelu_quick_f16(
7573
+ const struct ggml_compute_params * params,
7574
+ struct ggml_tensor * dst) {
7575
+
7576
+ const struct ggml_tensor * src0 = dst->src[0];
7577
+
7578
+ assert(ggml_is_contiguous_1(src0));
7579
+ assert(ggml_is_contiguous_1(dst));
7580
+ assert(ggml_are_same_shape(src0, dst));
7581
+
7582
+ const int ith = params->ith;
7583
+ const int nth = params->nth;
7584
+
7585
+ const int nc = src0->ne[0];
7586
+ const int nr = ggml_nrows(src0);
7587
+
7588
+ // rows per thread
7589
+ const int dr = (nr + nth - 1)/nth;
7590
+
7591
+ // row range for this thread
7592
+ const int ir0 = dr*ith;
7593
+ const int ir1 = MIN(ir0 + dr, nr);
7594
+
7595
+ for (int i1 = ir0; i1 < ir1; i1++) {
7596
+ ggml_vec_gelu_quick_f16(nc,
7597
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7598
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7599
+
7600
+ #ifndef NDEBUG
7601
+ for (int k = 0; k < nc; k++) {
7602
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7603
+ const float v = GGML_FP16_TO_FP32(x);
7604
+ UNUSED(v);
7605
+ assert(!isnan(v));
7606
+ assert(!isinf(v));
7607
+ }
7608
+ #endif
7609
+ }
7610
+ }
7611
+
6617
7612
  static void ggml_compute_forward_gelu_quick(
6618
7613
  const struct ggml_compute_params * params,
6619
7614
  struct ggml_tensor * dst) {
@@ -6625,6 +7620,10 @@ static void ggml_compute_forward_gelu_quick(
6625
7620
  {
6626
7621
  ggml_compute_forward_gelu_quick_f32(params, dst);
6627
7622
  } break;
7623
+ case GGML_TYPE_F16:
7624
+ {
7625
+ ggml_compute_forward_gelu_quick_f16(params, dst);
7626
+ } break;
6628
7627
  default:
6629
7628
  {
6630
7629
  GGML_ABORT("fatal error");
@@ -6673,6 +7672,46 @@ static void ggml_compute_forward_silu_f32(
6673
7672
  }
6674
7673
  }
6675
7674
 
7675
+ static void ggml_compute_forward_silu_f16(
7676
+ const struct ggml_compute_params * params,
7677
+ struct ggml_tensor * dst) {
7678
+
7679
+ const struct ggml_tensor * src0 = dst->src[0];
7680
+
7681
+ assert(ggml_is_contiguous_1(src0));
7682
+ assert(ggml_is_contiguous_1(dst));
7683
+ assert(ggml_are_same_shape(src0, dst));
7684
+
7685
+ const int ith = params->ith;
7686
+ const int nth = params->nth;
7687
+
7688
+ const int nc = src0->ne[0];
7689
+ const int nr = ggml_nrows(src0);
7690
+
7691
+ // rows per thread
7692
+ const int dr = (nr + nth - 1)/nth;
7693
+
7694
+ // row range for this thread
7695
+ const int ir0 = dr*ith;
7696
+ const int ir1 = MIN(ir0 + dr, nr);
7697
+
7698
+ for (int i1 = ir0; i1 < ir1; i1++) {
7699
+ ggml_vec_silu_f16(nc,
7700
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7701
+ (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
7702
+
7703
+ #ifndef NDEBUG
7704
+ for (int k = 0; k < nc; k++) {
7705
+ const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
7706
+ const float v = GGML_FP16_TO_FP32(x);
7707
+ UNUSED(v);
7708
+ assert(!isnan(v));
7709
+ assert(!isinf(v));
7710
+ }
7711
+ #endif
7712
+ }
7713
+ }
7714
+
6676
7715
  static void ggml_compute_forward_silu(
6677
7716
  const struct ggml_compute_params * params,
6678
7717
  struct ggml_tensor * dst) {
@@ -6684,6 +7723,10 @@ static void ggml_compute_forward_silu(
6684
7723
  {
6685
7724
  ggml_compute_forward_silu_f32(params, dst);
6686
7725
  } break;
7726
+ case GGML_TYPE_F16:
7727
+ {
7728
+ ggml_compute_forward_silu_f16(params, dst);
7729
+ } break;
6687
7730
  default:
6688
7731
  {
6689
7732
  GGML_ABORT("fatal error");
@@ -6712,13 +7755,43 @@ static void ggml_compute_forward_leaky_relu_f32(
6712
7755
  float negative_slope;
6713
7756
  memcpy(&negative_slope, dst->op_params, sizeof(float));
6714
7757
 
6715
- assert(dst->nb[0] == sizeof(float));
6716
- assert(src0->nb[0] == sizeof(float));
7758
+ assert(dst->nb[0] == sizeof(float));
7759
+ assert(src0->nb[0] == sizeof(float));
7760
+
7761
+ for (int i = 0; i < n; i++) {
7762
+ ggml_vec_leaky_relu_f32(nc,
7763
+ (float *) ((char *) dst->data + i*( dst->nb[1])),
7764
+ (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
7765
+ }
7766
+ }
7767
+
7768
+ static void ggml_compute_forward_leaky_relu_f16(
7769
+ const struct ggml_compute_params * params,
7770
+ struct ggml_tensor * dst) {
7771
+
7772
+ const struct ggml_tensor * src0 = dst->src[0];
7773
+
7774
+ if (params->ith != 0) {
7775
+ return;
7776
+ }
7777
+
7778
+ assert(ggml_is_contiguous_1(src0));
7779
+ assert(ggml_is_contiguous_1(dst));
7780
+ assert(ggml_are_same_shape(src0, dst));
7781
+
7782
+ const int n = ggml_nrows(src0);
7783
+ const int nc = src0->ne[0];
7784
+
7785
+ float negative_slope;
7786
+ memcpy(&negative_slope, dst->op_params, sizeof(float));
7787
+
7788
+ assert(dst->nb[0] == sizeof(ggml_fp16_t));
7789
+ assert(src0->nb[0] == sizeof(ggml_fp16_t));
6717
7790
 
6718
7791
  for (int i = 0; i < n; i++) {
6719
- ggml_vec_leaky_relu_f32(nc,
6720
- (float *) ((char *) dst->data + i*( dst->nb[1])),
6721
- (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
7792
+ ggml_vec_leaky_relu_f16(nc,
7793
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7794
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
6722
7795
  }
6723
7796
  }
6724
7797
 
@@ -6733,6 +7806,10 @@ static void ggml_compute_forward_leaky_relu(
6733
7806
  {
6734
7807
  ggml_compute_forward_leaky_relu_f32(params, dst);
6735
7808
  } break;
7809
+ case GGML_TYPE_F16:
7810
+ {
7811
+ ggml_compute_forward_leaky_relu_f16(params, dst);
7812
+ } break;
6736
7813
  default:
6737
7814
  {
6738
7815
  GGML_ABORT("fatal error");
@@ -6785,6 +7862,50 @@ static void ggml_compute_forward_silu_back_f32(
6785
7862
  }
6786
7863
  }
6787
7864
 
7865
+ static void ggml_compute_forward_silu_back_f16(
7866
+ const struct ggml_compute_params * params,
7867
+ struct ggml_tensor * dst) {
7868
+
7869
+ const struct ggml_tensor * grad = dst->src[0];
7870
+ const struct ggml_tensor * src1 = dst->src[1];
7871
+
7872
+ assert(ggml_is_contiguous_1(grad));
7873
+ assert(ggml_is_contiguous_1(src1));
7874
+ assert(ggml_is_contiguous_1(dst));
7875
+ assert(ggml_are_same_shape(src1, dst));
7876
+ assert(ggml_are_same_shape(src1, grad));
7877
+
7878
+ const int ith = params->ith;
7879
+ const int nth = params->nth;
7880
+
7881
+ const int nc = src1->ne[0];
7882
+ const int nr = ggml_nrows(src1);
7883
+
7884
+ // rows per thread
7885
+ const int dr = (nr + nth - 1)/nth;
7886
+
7887
+ // row range for this thread
7888
+ const int ir0 = dr*ith;
7889
+ const int ir1 = MIN(ir0 + dr, nr);
7890
+
7891
+ for (int i1 = ir0; i1 < ir1; i1++) {
7892
+ ggml_vec_silu_backward_f16(nc,
7893
+ (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
7894
+ (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
7895
+ (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
7896
+
7897
+ #ifndef NDEBUG
7898
+ for (int k = 0; k < nc; k++) {
7899
+ const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
7900
+ const float v = GGML_FP16_TO_FP32(x);
7901
+ UNUSED(v);
7902
+ assert(!isnan(v));
7903
+ assert(!isinf(v));
7904
+ }
7905
+ #endif
7906
+ }
7907
+ }
7908
+
6788
7909
  static void ggml_compute_forward_silu_back(
6789
7910
  const struct ggml_compute_params * params,
6790
7911
  struct ggml_tensor * dst) {
@@ -6796,6 +7917,10 @@ static void ggml_compute_forward_silu_back(
6796
7917
  {
6797
7918
  ggml_compute_forward_silu_back_f32(params, dst);
6798
7919
  } break;
7920
+ case GGML_TYPE_F16:
7921
+ {
7922
+ ggml_compute_forward_silu_back_f16(params, dst);
7923
+ } break;
6799
7924
  default:
6800
7925
  {
6801
7926
  GGML_ABORT("fatal error");
@@ -6803,7 +7928,6 @@ static void ggml_compute_forward_silu_back(
6803
7928
  }
6804
7929
  }
6805
7930
 
6806
-
6807
7931
  static void ggml_compute_forward_hardswish_f32(
6808
7932
  const struct ggml_compute_params * params,
6809
7933
  struct ggml_tensor * dst) {
@@ -6827,6 +7951,31 @@ static void ggml_compute_forward_hardswish_f32(
6827
7951
  (float *) ((char *) src0->data + i*(src0->nb[1])));
6828
7952
  }
6829
7953
  }
7954
+
7955
+ static void ggml_compute_forward_hardswish_f16(
7956
+ const struct ggml_compute_params * params,
7957
+ struct ggml_tensor * dst) {
7958
+
7959
+ const struct ggml_tensor * src0 = dst->src[0];
7960
+
7961
+ if (params->ith != 0) {
7962
+ return;
7963
+ }
7964
+
7965
+ assert(ggml_is_contiguous_1(src0));
7966
+ assert(ggml_is_contiguous_1(dst));
7967
+ assert(ggml_are_same_shape(src0, dst));
7968
+
7969
+ const int n = ggml_nrows(src0);
7970
+ const int nc = src0->ne[0];
7971
+
7972
+ for (int i = 0; i < n; i++) {
7973
+ ggml_vec_hardswish_f16(nc,
7974
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
7975
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
7976
+ }
7977
+ }
7978
+
6830
7979
  static void ggml_compute_forward_hardswish(
6831
7980
  const struct ggml_compute_params * params,
6832
7981
  struct ggml_tensor * dst) {
@@ -6838,6 +7987,10 @@ static void ggml_compute_forward_hardswish(
6838
7987
  {
6839
7988
  ggml_compute_forward_hardswish_f32(params, dst);
6840
7989
  } break;
7990
+ case GGML_TYPE_F16:
7991
+ {
7992
+ ggml_compute_forward_hardswish_f16(params, dst);
7993
+ } break;
6841
7994
  default:
6842
7995
  {
6843
7996
  GGML_ABORT("fatal error");
@@ -6869,6 +8022,30 @@ static void ggml_compute_forward_hardsigmoid_f32(
6869
8022
  }
6870
8023
  }
6871
8024
 
8025
+ static void ggml_compute_forward_hardsigmoid_f16(
8026
+ const struct ggml_compute_params * params,
8027
+ struct ggml_tensor * dst) {
8028
+
8029
+ const struct ggml_tensor * src0 = dst->src[0];
8030
+
8031
+ if (params->ith != 0) {
8032
+ return;
8033
+ }
8034
+
8035
+ assert(ggml_is_contiguous_1(src0));
8036
+ assert(ggml_is_contiguous_1(dst));
8037
+ assert(ggml_are_same_shape(src0, dst));
8038
+
8039
+ const int n = ggml_nrows(src0);
8040
+ const int nc = src0->ne[0];
8041
+
8042
+ for (int i = 0; i < n; i++) {
8043
+ ggml_vec_hardsigmoid_f16(nc,
8044
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
8045
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
8046
+ }
8047
+ }
8048
+
6872
8049
  static void ggml_compute_forward_hardsigmoid(
6873
8050
  const struct ggml_compute_params * params,
6874
8051
  struct ggml_tensor * dst) {
@@ -6880,6 +8057,10 @@ static void ggml_compute_forward_hardsigmoid(
6880
8057
  {
6881
8058
  ggml_compute_forward_hardsigmoid_f32(params, dst);
6882
8059
  } break;
8060
+ case GGML_TYPE_F16:
8061
+ {
8062
+ ggml_compute_forward_hardsigmoid_f16(params, dst);
8063
+ } break;
6883
8064
  default:
6884
8065
  {
6885
8066
  GGML_ABORT("fatal error");
@@ -6911,6 +8092,30 @@ static void ggml_compute_forward_exp_f32(
6911
8092
  }
6912
8093
  }
6913
8094
 
8095
+ static void ggml_compute_forward_exp_f16(
8096
+ const struct ggml_compute_params * params,
8097
+ struct ggml_tensor * dst) {
8098
+
8099
+ const struct ggml_tensor * src0 = dst->src[0];
8100
+
8101
+ if (params->ith != 0) {
8102
+ return;
8103
+ }
8104
+
8105
+ assert(ggml_is_contiguous_1(src0));
8106
+ assert(ggml_is_contiguous_1(dst));
8107
+ assert(ggml_are_same_shape(src0, dst));
8108
+
8109
+ const int n = ggml_nrows(src0);
8110
+ const int nc = src0->ne[0];
8111
+
8112
+ for (int i = 0; i < n; i++) {
8113
+ ggml_vec_exp_f16(nc,
8114
+ (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
8115
+ (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
8116
+ }
8117
+ }
8118
+
6914
8119
  static void ggml_compute_forward_exp(
6915
8120
  const struct ggml_compute_params * params,
6916
8121
  struct ggml_tensor * dst) {
@@ -6922,6 +8127,10 @@ static void ggml_compute_forward_exp(
6922
8127
  {
6923
8128
  ggml_compute_forward_exp_f32(params, dst);
6924
8129
  } break;
8130
+ case GGML_TYPE_F16:
8131
+ {
8132
+ ggml_compute_forward_exp_f16(params, dst);
8133
+ } break;
6925
8134
  default:
6926
8135
  {
6927
8136
  GGML_ABORT("fatal error");
@@ -7496,6 +8705,7 @@ UseGgmlGemm1:;
7496
8705
  if (src1->type != vec_dot_type) {
7497
8706
  char * wdata = params->wdata;
7498
8707
 
8708
+ const size_t nbw0 = ggml_type_size(vec_dot_type);
7499
8709
  const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
7500
8710
  const size_t nbw2 = nbw1*ne11;
7501
8711
  const size_t nbw3 = nbw2*ne12;
@@ -7503,6 +8713,7 @@ UseGgmlGemm1:;
7503
8713
  assert(params->wsize >= ne13*nbw3);
7504
8714
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
7505
8715
 
8716
+ #if 0
7506
8717
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7507
8718
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
7508
8719
  for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
@@ -7512,6 +8723,20 @@ UseGgmlGemm1:;
7512
8723
  }
7513
8724
  }
7514
8725
  }
8726
+ #else
8727
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
8728
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
8729
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
8730
+ size_t bs = ggml_blck_size(vec_dot_type);
8731
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
8732
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
8733
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
8734
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
8735
+ (ne10_block_end - ne10_block_start) * bs);
8736
+ }
8737
+ }
8738
+ }
8739
+ #endif
7515
8740
  }
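This hunk changes how src1 is converted to vec_dot_type: rather than assigning whole rows to threads round-robin, every thread now converts a slice of each row, with the slice boundaries rounded to the quantization block size so no block is split between threads. A small sketch of the index arithmetic (assuming, as ggml requires, that ne10 is a multiple of the block size):

```c
// Sketch of the block-aligned column split used when converting src1 rows
// into wdata: each thread gets a contiguous slice of ne10, rounded to the
// quant block size bs, so threads never share a block.
#include <stdint.h>

static void thread_col_range(int64_t ne10, int64_t bs, int ith, int nth,
                             int64_t * start, int64_t * end) {
    const int64_t nblk = ne10 / bs;             // whole blocks in the row
    *start = (ith       * nblk) / nth * bs;     // first element for this thread
    *end   = ((ith + 1) * nblk) / nth * bs;     // one past the last element
}
```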
7516
8741
 
7517
8742
  if (ith == 0) {
@@ -7566,7 +8791,7 @@ UseGgmlGemm2:;
7566
8791
  int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
7567
8792
 
7568
8793
  // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
7569
- // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
8794
+ // Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
7570
8795
  // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
7571
8796
  if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
7572
8797
  // distribute the thread work across the inner or outer loop based on which one is larger
@@ -7599,7 +8824,6 @@ UseGgmlGemm2:;
7599
8824
  if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
7600
8825
  num_rows_per_vec_dot = 1;
7601
8826
  }
7602
-
7603
8827
  ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7604
8828
 
7605
8829
  if (nth >= nchunk0 * nchunk1) {
@@ -7612,6 +8836,84 @@ UseGgmlGemm2:;
7612
8836
 
7613
8837
  // ggml_compute_forward_mul_mat_id
7614
8838
 
8839
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
8840
+
8841
+ struct mmid_row_mapping {
8842
+ int32_t i1;
8843
+ int32_t i2;
8844
+ };
8845
+
8846
+ static void ggml_compute_forward_mul_mat_id_one_chunk(
8847
+ struct ggml_tensor * dst,
8848
+ const struct ggml_tensor * src0,
8849
+ const struct ggml_tensor * src1,
8850
+ const struct ggml_tensor * ids,
8851
+ const int64_t cur_a,
8852
+ const int64_t ir0_start,
8853
+ const int64_t ir0_end,
8854
+ const int64_t ir1_start,
8855
+ const int64_t ir1_end,
8856
+ const char * src0_cur,
8857
+ const struct mmid_row_mapping * matrix_rows,
8858
+ const size_t row_size,
8859
+ const bool src1_cont,
8860
+ const void * wdata) {
8861
+
8862
+ GGML_TENSOR_BINARY_OP_LOCALS
8863
+
8864
+ const enum ggml_type type = src0->type;
8865
+
8866
+ ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
8867
+ enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
8868
+
8869
+ const int64_t blck_0 = 16;
8870
+ const int64_t blck_1 = 16;
8871
+
8872
+ float tmp[16];
8873
+
8874
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
8875
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
8876
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
8877
+ const int64_t _i12 = ir1; // logical row index for this expert
8878
+
8879
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
8880
+ const int id = row_mapping.i1; // selected expert index
8881
+
8882
+ const int64_t i11 = id % ne11;
8883
+ const int64_t i12 = row_mapping.i2; // row index in src1
8884
+
8885
+ const int64_t i1 = id; // selected expert index
8886
+ const int64_t i2 = i12; // row
8887
+
8888
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
8889
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
8890
+ // the original src1 data pointer, so we should index using the indices directly
8891
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
8892
+ const char * src1_col = (const char *) wdata +
8893
+ (src1_cont || src1->type != vec_dot_type
8894
+ ? (i11 + i12*ne11)*row_size
8895
+ : (i11*nb11 + i12*nb12));
8896
+
8897
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
8898
+
8899
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
8900
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
8901
+ }
8902
+
8903
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
8904
+ }
8905
+ }
8906
+ }
8907
+ }
8908
+
8909
+ static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
8910
+
8911
+ void * ptr = *p;
8912
+ ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
8913
+ *p = (void *) ((char *) ptr + size);
8914
+ return ptr;
8915
+ }
8916
+
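incr_ptr_aligned is a tiny bump allocator over the shared params->wdata scratch buffer: align the cursor, hand it out, advance it by size. A usage sketch of carving several aligned regions out of one buffer, in the spirit of how mul_mat_id lays out its row counts, row mappings and per-expert chunk counters (buffer and sizes are illustrative):

```c
// Usage sketch of a bump allocator over one scratch buffer.
#include <stdint.h>
#include <stddef.h>

#define PAD(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))   // round up to a power-of-two alignment

static void * bump_aligned(void ** p, size_t size, size_t align) {
    void * ptr = (void *) PAD((uintptr_t) *p, align);   // align the cursor
    *p = (char *) ptr + size;                           // advance past this region
    return ptr;
}

int main(void) {
    char scratch[4096];
    void    * cur    = scratch;
    int64_t * counts = bump_aligned(&cur, 8 * sizeof(int64_t), sizeof(int64_t));
    char    * rows   = bump_aligned(&cur, 256, sizeof(int64_t));
    (void) counts; (void) rows;
    // total bytes consumed: (char *) cur - scratch  (this is what must fit in wsize)
    return 0;
}
```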
7615
8917
  static void ggml_compute_forward_mul_mat_id(
7616
8918
  const struct ggml_compute_params * params,
7617
8919
  struct ggml_tensor * dst) {
@@ -7629,7 +8931,6 @@ static void ggml_compute_forward_mul_mat_id(
7629
8931
 
7630
8932
  const bool src1_cont = ggml_is_contiguous(src1);
7631
8933
 
7632
- ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
7633
8934
  enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7634
8935
  ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7635
8936
 
@@ -7647,21 +8948,27 @@ static void ggml_compute_forward_mul_mat_id(
7647
8948
  const int n_ids = ids->ne[0]; // n_expert_used
7648
8949
  const int n_as = ne02; // n_expert
7649
8950
 
7650
- char * wdata_src1_end = (src1->type == vec_dot_type) ?
7651
- (char *) params->wdata :
7652
- (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
8951
+ void * wdata_cur = params->wdata;
7653
8952
 
7654
- struct mmid_row_mapping {
7655
- int32_t i1;
7656
- int32_t i2;
7657
- };
8953
+ if (src1->type != vec_dot_type) {
8954
+ incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
8955
+ }
8956
+
8957
+ int64_t * matrix_row_counts = // [n_as]
8958
+ incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
7658
8959
 
7659
- int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
7660
- struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
8960
+ struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
8961
+ incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
8962
+
8963
+ char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
8964
+ incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
8965
+
8966
+ GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
7661
8967
 
7662
8968
  if (src1->type != vec_dot_type) {
7663
8969
  char * wdata = params->wdata;
7664
8970
 
8971
+ const size_t nbw0 = ggml_type_size(vec_dot_type);
7665
8972
  const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
7666
8973
  const size_t nbw2 = nbw1*ne11;
7667
8974
  const size_t nbw3 = nbw2*ne12;
@@ -7669,19 +8976,32 @@ static void ggml_compute_forward_mul_mat_id(
7669
8976
  assert(params->wsize >= ne13*nbw3);
7670
8977
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
7671
8978
 
8979
+ #if 0
7672
8980
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7673
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
7674
- for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
8981
+ for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
8982
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
7675
8983
  from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7676
8984
  (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7677
8985
  ne10);
7678
8986
  }
7679
8987
  }
7680
8988
  }
8989
+ #else
8990
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
8991
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
8992
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
8993
+ size_t bs = ggml_blck_size(vec_dot_type);
8994
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
8995
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
8996
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
8997
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
8998
+ (ne10_block_end - ne10_block_start) * bs);
8999
+ }
9000
+ }
9001
+ }
9002
+ #endif
7681
9003
  }
7682
9004
 
7683
- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
7684
-
7685
9005
  if (ith == 0) {
7686
9006
  // initialize matrix_row_counts
7687
9007
  memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7699,9 +9019,14 @@ static void ggml_compute_forward_mul_mat_id(
7699
9019
  }
7700
9020
  }
7701
9021
 
9022
+ // reset current_chunk
9023
+ for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
9024
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
9025
+ *current_chunk_ctr = nth;
9026
+ }
9027
+
7702
9028
  ggml_barrier(params->threadpool);
7703
9029
 
7704
- // compute each matrix multiplication in sequence
7705
9030
  for (int cur_a = 0; cur_a < n_as; ++cur_a) {
7706
9031
  const int64_t cne1 = matrix_row_counts[cur_a];
7707
9032
 
@@ -7709,84 +9034,64 @@ static void ggml_compute_forward_mul_mat_id(
7709
9034
  continue;
7710
9035
  }
7711
9036
 
7712
- const char * src0_cur = (const char *) src0->data + cur_a*nb02;
7713
-
7714
- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
9037
+ const char * src0_cur = (const char *) src0->data + cur_a * nb02;
9038
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
7715
9039
  const size_t row_size = ggml_row_size(vec_dot_type, ne10);
7716
9040
 
7717
- const int64_t nr0 = ne01; // src0 rows
7718
- const int64_t nr1 = cne1; // src1 rows
7719
-
7720
- // distribute the thread work across the inner or outer loop based on which one is larger
7721
-
7722
- const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
7723
- const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
7724
-
7725
- const int64_t ith0 = ith % nth0;
7726
- const int64_t ith1 = ith / nth0;
7727
-
7728
- const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
7729
- const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
9041
+ const int64_t nr0 = ne01;
9042
+ const int64_t nr1 = cne1;
7730
9043
 
7731
- const int64_t ir010 = dr0*ith0;
7732
- const int64_t ir011 = MIN(ir010 + dr0, nr0);
7733
-
7734
- const int64_t ir110 = dr1*ith1;
7735
- const int64_t ir111 = MIN(ir110 + dr1, nr1);
7736
-
7737
- // threads with no work simply yield (not sure if it helps)
7738
- //if (ir010 >= ir011 || ir110 >= ir111) {
7739
- // sched_yield();
7740
- // continue;
7741
- //}
9044
+ int chunk_size = 16;
9045
+ if (nr0 == 1 || nr1 == 1) {
9046
+ chunk_size = 64;
9047
+ }
7742
9048
 
7743
- // block-tiling attempt
7744
- const int64_t blck_0 = 16;
7745
- const int64_t blck_1 = 16;
9049
+ #if defined(__aarch64__)
9050
+ // disable for ARM
9051
+ const bool disable_chunking = true;
9052
+ #else
9053
+ // disable for NUMA
9054
+ const bool disable_chunking = ggml_is_numa();
9055
+ #endif // defined(__aarch64__)
7746
9056
 
7747
- // attempt to reduce false-sharing (does not seem to make a difference)
7748
- float tmp[16];
9057
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
9058
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
7749
9059
 
7750
- for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
7751
- for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
7752
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
7753
- const int64_t _i12 = ir1; // logical row index for this expert
9060
+ if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
9061
+ nchunk0 = nr0 > nr1 ? nth : 1;
9062
+ nchunk1 = nr0 > nr1 ? 1 : nth;
9063
+ }
7754
9064
 
7755
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
7756
- const int id = row_mapping.i1; // selected expert index
9065
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
9066
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
7757
9067
 
7758
- const int64_t i11 = id % ne11;
7759
- const int64_t i12 = row_mapping.i2; // row index in src1
9068
+ int current_chunk = ith;
7760
9069
 
7761
- const int64_t i1 = id; // selected expert index
7762
- const int64_t i2 = i12; // row
9070
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
7763
9071
 
7764
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
7765
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
7766
- // the original src1 data pointer, so we should index using the indices directly
7767
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
7768
- const char * src1_col = (const char *) wdata +
7769
- (src1_cont || src1->type != vec_dot_type
7770
- ? (i11 + i12*ne11)*row_size
7771
- : (i11*nb11 + i12*nb12));
9072
+ while (current_chunk < nchunk0 * nchunk1) {
9073
+ const int64_t ith0 = current_chunk % nchunk0;
9074
+ const int64_t ith1 = current_chunk / nchunk0;
7772
9075
 
7773
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
9076
+ const int64_t ir0_start = dr0 * ith0;
9077
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
7774
9078
 
7775
- //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
7776
- // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
7777
- //}
9079
+ const int64_t ir1_start = dr1 * ith1;
9080
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
7778
9081
 
7779
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
7780
- vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
7781
- }
9082
+ ggml_compute_forward_mul_mat_id_one_chunk(
9083
+ dst, src0, src1, ids, cur_a,
9084
+ ir0_start, ir0_end, ir1_start, ir1_end,
9085
+ src0_cur, matrix_rows, row_size, src1_cont, wdata
9086
+ );
7782
9087
 
7783
- memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
7784
- }
9088
+ if (nth >= nchunk0 * nchunk1) {
9089
+ break;
7785
9090
  }
9091
+
9092
+ current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
7786
9093
  }
7787
9094
  }
7788
-
7789
- #undef MMID_MATRIX_ROW
7790
9095
  }
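The rewritten mul_mat_id no longer gives each thread a fixed row range per expert; it splits the (src0 rows x routed src1 rows) space into chunks and lets threads claim chunk indices from a per-expert atomic counter, pre-seeded to nth so that chunks 0..nth-1 are taken implicitly by thread id. A minimal sketch of that work-stealing pattern in plain C11 (names illustrative):

```c
// Sketch of the work-stealing loop: threads claim chunk indices from a
// shared atomic counter that was initialized to nth before the barrier.
#include <stdatomic.h>

static void worker(atomic_int * next_chunk, int ith, int nth, int nchunks) {
    int chunk = ith;                    // chunks [0, nth) are claimed implicitly
    while (chunk < nchunks) {
        // ... process chunk `chunk` ...
        if (nth >= nchunks) {
            break;                      // one chunk per thread, no stealing needed
        }
        chunk = atomic_fetch_add_explicit(next_chunk, 1, memory_order_relaxed);
    }
}
```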
7791
9096
 
7792
9097
  // ggml_compute_forward_out_prod
@@ -9080,10 +10385,6 @@ static void ggml_compute_forward_clamp_f32(
9080
10385
 
9081
10386
  const struct ggml_tensor * src0 = dst->src[0];
9082
10387
 
9083
- if (params->ith != 0) {
9084
- return;
9085
- }
9086
-
9087
10388
  float min;
9088
10389
  float max;
9089
10390
  memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
@@ -9114,6 +10415,43 @@ static void ggml_compute_forward_clamp_f32(
9114
10415
  }
9115
10416
  }
9116
10417
 
10418
+ static void ggml_compute_forward_clamp_f16(
10419
+ const struct ggml_compute_params * params,
10420
+ struct ggml_tensor * dst) {
10421
+
10422
+ const struct ggml_tensor * src0 = dst->src[0];
10423
+
10424
+ float min;
10425
+ float max;
10426
+ memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
10427
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
10428
+
10429
+ const int ith = params->ith;
10430
+ const int nth = params->nth;
10431
+
10432
+ const int n = ggml_nrows(src0);
10433
+ const int nc = src0->ne[0];
10434
+
10435
+ const size_t nb00 = src0->nb[0];
10436
+ const size_t nb01 = src0->nb[1];
10437
+
10438
+ const size_t nb0 = dst->nb[0];
10439
+ const size_t nb1 = dst->nb[1];
10440
+
10441
+ GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
10442
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
10443
+
10444
+ for (int j = ith; j < n; j += nth) {
10445
+ ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
10446
+ ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
10447
+
10448
+ for (int i = 0; i < nc; i++) {
10449
+ float v = GGML_FP16_TO_FP32(src0_ptr[i]);
10450
+ dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
10451
+ }
10452
+ }
10453
+ }
10454
+
9117
10455
  static void ggml_compute_forward_clamp(
9118
10456
  const struct ggml_compute_params * params,
9119
10457
  struct ggml_tensor * dst) {
@@ -9126,6 +10464,9 @@ static void ggml_compute_forward_clamp(
9126
10464
  ggml_compute_forward_clamp_f32(params, dst);
9127
10465
  } break;
9128
10466
  case GGML_TYPE_F16:
10467
+ {
10468
+ ggml_compute_forward_clamp_f16(params, dst);
10469
+ } break;
9129
10470
  case GGML_TYPE_BF16:
9130
10471
  case GGML_TYPE_Q4_0:
9131
10472
  case GGML_TYPE_Q4_1:
@@ -13723,14 +15064,19 @@ struct ggml_cplan ggml_graph_plan(
13723
15064
  cur = 0;
13724
15065
  const struct ggml_tensor * src0 = node->src[0];
13725
15066
  const struct ggml_tensor * src1 = node->src[1];
15067
+ const struct ggml_tensor * ids = node->src[2];
13726
15068
  const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
15069
+ const int n_as = src0->ne[2];
15070
+ // src1
13727
15071
  if (src1->type != vec_dot_type) {
13728
- cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
15072
+ cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
13729
15073
  }
13730
- const int n_as = src0->ne[2];
13731
- cur += GGML_PAD(cur, sizeof(int64_t)); // align
13732
- cur += n_as * sizeof(int64_t); // matrix_row_counts
13733
- cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
15074
+ // matrix_row_counts
15075
+ cur += n_as * sizeof(int64_t) + sizeof(int64_t);
15076
+ // matrix_rows
15077
+ cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
15078
+ // atomic_current_chunk
15079
+ cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
13734
15080
  } break;
13735
15081
  case GGML_OP_OUT_PROD:
13736
15082
  {
@@ -13862,9 +15208,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
13862
15208
  tp->ec = GGML_STATUS_ABORTED;
13863
15209
  }
13864
15210
 
13865
- ggml_barrier(state->threadpool);
15211
+ if (node_n + 1 < cgraph->n_nodes) {
15212
+ ggml_barrier(state->threadpool);
15213
+ }
13866
15214
  }
13867
15215
 
15216
+ ggml_barrier(state->threadpool);
15217
+
13868
15218
  return 0;
13869
15219
  }
13870
15220
 
@@ -14229,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
14229
15579
  #endif
14230
15580
  }
14231
15581
 
15582
+ int ggml_cpu_has_bmi2(void) {
15583
+ #if defined(__BMI2__)
15584
+ return 1;
15585
+ #else
15586
+ return 0;
15587
+ #endif
15588
+ }
15589
+
14232
15590
  int ggml_cpu_has_fma(void) {
14233
15591
  #if defined(__FMA__)
14234
15592
  return 1;
@@ -14309,6 +15667,14 @@ int ggml_cpu_has_vsx(void) {
14309
15667
  #endif
14310
15668
  }
14311
15669
 
15670
+ int ggml_cpu_has_vxe(void) {
15671
+ #if defined(__VXE__) || defined(__VXE2__)
15672
+ return 1;
15673
+ #else
15674
+ return 0;
15675
+ #endif
15676
+ }
15677
+
14312
15678
  int ggml_cpu_has_neon(void) {
14313
15679
  #if defined(__ARM_ARCH) && defined(__ARM_NEON)
14314
15680
  return ggml_arm_arch_features.has_neon;
@@ -14349,6 +15715,14 @@ int ggml_cpu_get_sve_cnt(void) {
14349
15715
  #endif
14350
15716
  }
14351
15717
 
15718
+ int ggml_cpu_has_sme(void) {
15719
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
15720
+ return ggml_arm_arch_features.has_sme;
15721
+ #else
15722
+ return 0;
15723
+ #endif
15724
+ }
15725
+
14352
15726
  void ggml_cpu_init(void) {
14353
15727
  // needed to initialize f16 tables
14354
15728
  {