@fugood/llama.node 0.3.13 → 0.3.14

This diff shows the changes between publicly released versions of the package, as published to the supported registries. It is provided for informational purposes only.
Files changed (139)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +1 -1
  18. package/package.json +1 -1
  19. package/src/LlamaContext.cpp +98 -76
  20. package/src/LlamaContext.h +1 -1
  21. package/src/common.hpp +1 -2
  22. package/src/llama.cpp/.github/workflows/build.yml +60 -10
  23. package/src/llama.cpp/.github/workflows/server.yml +2 -0
  24. package/src/llama.cpp/common/CMakeLists.txt +3 -3
  25. package/src/llama.cpp/common/arg.cpp +112 -11
  26. package/src/llama.cpp/common/chat.cpp +960 -266
  27. package/src/llama.cpp/common/chat.h +135 -0
  28. package/src/llama.cpp/common/common.cpp +27 -171
  29. package/src/llama.cpp/common/common.h +27 -67
  30. package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
  31. package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
  32. package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +37 -5
  33. package/src/llama.cpp/common/ngram-cache.cpp +1 -0
  34. package/src/llama.cpp/common/sampling.cpp +45 -7
  35. package/src/llama.cpp/common/speculative.cpp +6 -5
  36. package/src/llama.cpp/common/speculative.h +1 -1
  37. package/src/llama.cpp/docs/build.md +45 -7
  38. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
  39. package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
  40. package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
  41. package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -3
  42. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
  43. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  44. package/src/llama.cpp/examples/llava/clip.cpp +373 -107
  45. package/src/llama.cpp/examples/llava/clip.h +19 -3
  46. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
  47. package/src/llama.cpp/examples/llava/llava.cpp +4 -2
  48. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
  49. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
  50. package/src/llama.cpp/examples/main/main.cpp +73 -28
  51. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
  52. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
  53. package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
  54. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
  55. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
  56. package/src/llama.cpp/examples/run/run.cpp +110 -67
  57. package/src/llama.cpp/examples/server/server.cpp +82 -87
  58. package/src/llama.cpp/examples/server/utils.hpp +94 -107
  59. package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
  60. package/src/llama.cpp/examples/tts/tts.cpp +251 -142
  61. package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
  62. package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
  63. package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
  64. package/src/llama.cpp/ggml/include/ggml-cpu.h +3 -0
  65. package/src/llama.cpp/ggml/include/ggml.h +5 -1
  66. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  67. package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
  68. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
  69. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
  70. package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
  71. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
  72. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
  73. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
  74. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
  75. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
  76. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
  77. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +151 -0
  78. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1396 -386
  79. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1432 -151
  80. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +22 -0
  81. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
  82. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
  83. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
  84. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
  85. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +15 -2
  86. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
  87. package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
  88. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
  89. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
  90. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
  91. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
  92. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +220 -116
  93. package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
  94. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
  95. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
  96. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  97. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
  98. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
  99. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
  100. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
  101. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
  102. package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
  103. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
  104. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
  105. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
  106. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
  107. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +168 -721
  108. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
  109. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
  111. package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
  112. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +146 -42
  113. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +13 -3
  114. package/src/llama.cpp/ggml/src/ggml.c +8 -3
  115. package/src/llama.cpp/include/llama.h +19 -5
  116. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
  117. package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
  118. package/src/llama.cpp/requirements/requirements-all.txt +1 -0
  119. package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
  120. package/src/llama.cpp/requirements.txt +1 -0
  121. package/src/llama.cpp/src/llama-arch.cpp +21 -0
  122. package/src/llama.cpp/src/llama-arch.h +1 -0
  123. package/src/llama.cpp/src/llama-chat.cpp +1 -0
  124. package/src/llama.cpp/src/llama-grammar.cpp +182 -182
  125. package/src/llama.cpp/src/llama-grammar.h +12 -3
  126. package/src/llama.cpp/src/llama-kv-cache.h +1 -0
  127. package/src/llama.cpp/src/llama-mmap.cpp +11 -1
  128. package/src/llama.cpp/src/llama-model.cpp +69 -5
  129. package/src/llama.cpp/src/llama-sampling.cpp +43 -10
  130. package/src/llama.cpp/src/llama-vocab.cpp +12 -0
  131. package/src/llama.cpp/src/llama.cpp +147 -0
  132. package/src/llama.cpp/tests/test-backend-ops.cpp +166 -110
  133. package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
  134. package/src/llama.cpp/tests/test-chat.cpp +593 -395
  135. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
  136. package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
  137. package/src/llama.cpp/Sources/llama/llama.h +0 -4
  138. package/src/llama.cpp/common/chat.hpp +0 -55
  139. package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +0 -0
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c

@@ -112,7 +112,8 @@ struct ggml_arm_arch_features_type {
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-} ggml_arm_arch_features = {-1, -1, -1, -1, 0};
+    int has_sme;
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
 #endif
 
 
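Note: the new has_sme flag follows the existing convention of starting at -1 ("not yet probed") and being filled in by ggml_init_arm_arch_features(). A minimal sketch of the accessor this presumably pairs with (ggml-cpu.h gains three lines in this release; the exact declaration below is an assumption modeled on ggml_cpu_has_sve and friends, not taken from the diff):

    // Sketch (assumed shape, mirroring the other ggml_cpu_has_* accessors):
    int ggml_cpu_has_sme(void) {
    #if defined(__ARM_ARCH)
        return ggml_arm_arch_features.has_sme;
    #else
        return 0;
    #endif
    }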
@@ -236,6 +237,8 @@ typedef pthread_t ggml_thread_t;
 #else
 #if defined(__POWER9_VECTOR__)
 #define CACHE_LINE_SIZE 128
+#elif defined(__VXE__) || defined(__VXE2__)
+#define CACHE_LINE_SIZE 256
 #else
 #define CACHE_LINE_SIZE 64
 #endif
@@ -244,9 +247,9 @@ typedef pthread_t ggml_thread_t;
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
 
 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = {
@@ -1210,6 +1213,87 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_VEC_MUL    GGML_F32Cx4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 
+#elif defined(__VXE__) || defined(__VXE2__)
+
+#define GGML_SIMD
+
+// F32 s390x
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              __vector float
+#define GGML_F32x4_ZERO         vec_splats(0.0f)
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)                \
+{                                                \
+    int offset = GGML_F32_ARR >> 1;              \
+    for (int i = 0; i < offset; ++i) {           \
+        x[i] = vec_add(x[i], x[offset + i]);     \
+    }                                            \
+    offset >>= 1;                                \
+    for (int i = 0; i < offset; ++i) {           \
+        x[i] = vec_add(x[i], x[offset + i]);     \
+    }                                            \
+    offset >>= 1;                                \
+    for (int i = 0; i < offset; ++i) {           \
+        x[i] = vec_add(x[i], x[offset + i]);     \
+    }                                            \
+    res = vec_extract(x[0], 0) +                 \
+          vec_extract(x[0], 1) +                 \
+          vec_extract(x[0], 2) +                 \
+          vec_extract(x[0], 3);                  \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 s390x
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR  GGML_F32_EPR
+
+static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return vec_xl(0, tmp);
+}
+
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
+    float arr[4];
+
+    vec_xst(y, 0, arr);
+
+    for (int i = 0; i < 4; i++) {
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
+}
+
+#define GGML_F16_VEC                GGML_F32x4
+#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
+#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
+
 #endif
 
 // GGML_F32_ARR / GGML_F16_ARR
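Note: the new __VXE__/__VXE2__ block plugs s390x vector intrinsics into ggml's portable SIMD macro layer. For orientation, a hedged sketch (not from the diff) of how that layer is consumed, mirroring the structure of ggml_vec_dot_f32; GGML_F32_ARR is GGML_F32_STEP/GGML_F32_EPR:

    // Sketch: accumulate into GGML_F32_ARR vector registers, reduce horizontally,
    // then handle the scalar tail. Any backend that defines the macro family
    // (SSE, NEON, VSX, and now VXE) gets this loop unchanged.
    float dot_f32(int n, const float * x, const float * y) {
        float res = 0.0f;
        const int np = (n & ~(GGML_F32_STEP - 1));          // bulk part, multiple of STEP
        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
        GGML_F32_VEC ax[GGML_F32_ARR];
        GGML_F32_VEC ay[GGML_F32_ARR];
        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
            }
        }
        GGML_F32_VEC_REDUCE(res, sum);                      // horizontal add
        for (int i = np; i < n; ++i) {                      // leftovers
            res += x[i]*y[i];
        }
        return res;
    }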
@@ -1331,17 +1415,43 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
+inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
 inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
+inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_set_f32 (const int n, float * x, const float   v) { for (int i = 0; i < n; ++i) x[i]  = v;    }
 inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i]  = x[i]; }
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i]  = -x[i]; }
+inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+    }
+}
+
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
+inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
+inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+    }
+}
 
-static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
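Note: every _f16 helper above follows the same widen-compute-narrow pattern: convert to f32, do the arithmetic, convert back, since ggml_fp16_t is a storage type with no arithmetic of its own on most targets. A one-function illustration of the pattern (the fma_f16 name is hypothetical, not from the diff):

    // All arithmetic happens in f32; f16 is storage only.
    static inline ggml_fp16_t fma_f16(ggml_fp16_t a, ggml_fp16_t b, ggml_fp16_t c) {
        return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(a)*GGML_FP16_TO_FP32(b) + GGML_FP16_TO_FP32(c));
    }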
@@ -1384,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
     *s = sumf;
 }
 
-static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) {
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -1452,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     *s = sumf;
 }
 
-static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) {
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -1496,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *
 
 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
 
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
 
     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1549,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
     }
 }
 
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -1580,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }
 
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
@@ -1612,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
 }
 
 // xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
 
-    const float * restrict x[GGML_VEC_MAD_UNROLL];
-    const float * restrict v[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
 
     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
         x[i] = (const float *) ((const char *) xv + i*xs);
@@ -1726,22 +1836,107 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
 
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
+inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*v);
+    }
+}
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
+inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
+inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
+inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+    }
+}
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+    }
+}
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+    }
+}
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+    }
+}
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+    }
+}
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+    }
+}
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+    }
+}
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1809,10 +2004,21 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 }
 #endif
 
+inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+    }
+}
+
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
+inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
+    float v = GGML_FP16_TO_FP32(x);
+    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+}
 
 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
@@ -2036,6 +2242,12 @@ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }
 
+inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_silu_f16(x[i]);
+    }
+}
+
 static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
@@ -2107,12 +2319,24 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
     return dy*s*(1.0f + x*(1.0f - s));
 }
 
+inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
+    const float v = GGML_FP16_TO_FP32(x);
+    const float s = 1.0f/(1.0f + expf(-v));
+    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+}
+
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }
 
+inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
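Note: the backward formula above follows directly from differentiating silu(x) = x\,\sigma(x) with \sigma'(x) = \sigma(x)(1-\sigma(x)):

    \frac{d}{dx}\bigl(x\,\sigma(x)\bigr) = \sigma(x) + x\,\sigma(x)\bigl(1-\sigma(x)\bigr) = \sigma(x)\bigl(1 + x(1-\sigma(x))\bigr)

which is exactly dy*s*(1.0f + v*(1.0f - s)) with s = \sigma(v); the f16 variant wraps the same f32 math in storage conversions.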
@@ -2381,15 +2605,20 @@ bool ggml_is_numa(void) {
 #define HWCAP2_I8MM (1 << 13)
 #endif
 
+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
 static void ggml_init_arm_arch_features(void) {
 #if defined(__linux__) && defined(__aarch64__)
     uint32_t hwcap  = getauxval(AT_HWCAP);
     uint32_t hwcap2 = getauxval(AT_HWCAP2);
 
-    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_arm_arch_features.has_neon    = !!(hwcap & HWCAP_ASIMD);
     ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
-    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
-    ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
+    ggml_arm_arch_features.has_i8mm    = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve     = !!(hwcap & HWCAP_SVE);
+    ggml_arm_arch_features.has_sme     = !!(hwcap2 & HWCAP2_SME);
 
 #if defined(__ARM_FEATURE_SVE)
     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
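Note: the #define HWCAP2_SME (1 << 23) fallback matches the Linux arm64 UAPI value and only matters when building against older kernel headers; at runtime the bit is still reported (or not) by the running kernel. A minimal standalone sketch of the same probing mechanism (illustrative, not from the diff):

    // Sketch: runtime CPU-feature probe on Linux/aarch64.
    #include <sys/auxv.h>
    static int cpu_supports_sme(void) {
        return !!(getauxval(AT_HWCAP2) & HWCAP2_SME);  // 0 on kernels that predate SME
    }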
@@ -2412,6 +2641,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_i8mm = oldp;
 
+    if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_sme = oldp;
+
     ggml_arm_arch_features.has_sve = 0;
     ggml_arm_arch_features.sve_cnt = 0;
 #else
@@ -2435,6 +2669,12 @@ static void ggml_init_arm_arch_features(void) {
     ggml_arm_arch_features.has_sve = 0;
     ggml_arm_arch_features.sve_cnt = 0;
 #endif
+
+#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
+    ggml_arm_arch_features.has_sme = 1;
+#else
+    ggml_arm_arch_features.has_sme = 0;
+#endif
 #endif
 }
 #endif
@@ -4279,7 +4519,7 @@ static void ggml_compute_forward_add_f16_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
 
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -4304,17 +4544,22 @@ static void ggml_compute_forward_add_f16_f16(
 
     if (nb10 == sizeof(ggml_fp16_t)) {
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
 
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
 
-            for (int i = 0; i < ne0; i++) {
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(src1_ptr[i]));
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
             }
         }
     }
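Note: the index arithmetic above is ggml's standard broadcast scheme, and the same block reappears in the new sub/mul/div f16 kernels below. Row ir is decomposed into (i01, i02, i03) for src0/dst; src1's row is found by wrapping each coordinate (i13 = i03 % ne13, and so on), and within a row src1 is reused nr0 = ne00/ne10 times. A stripped-down sketch of just the mapping (shapes assumed pre-validated by ggml_can_repeat, i.e. each src0 extent is a multiple of the src1 extent):

    // Sketch: broadcast-aware traversal of a larger tensor a against a smaller b.
    for (int64_t i03 = 0; i03 < ne03; ++i03)
    for (int64_t i02 = 0; i02 < ne02; ++i02)
    for (int64_t i01 = 0; i01 < ne01; ++i01) {
        const int64_t i13 = i03 % ne13;   // src1 coordinates wrap around
        const int64_t i12 = i02 % ne12;
        const int64_t i11 = i01 % ne11;
        for (int64_t r = 0; r < ne00/ne10; ++r) {
            // one ne10-wide vector op per repeat; the same src1 row is reused
        }
    }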
@@ -5102,6 +5347,62 @@ static void ggml_compute_forward_sub_f32(
     }
 }
 
+static void ggml_compute_forward_sub_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
 static void ggml_compute_forward_sub(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5113,6 +5414,10 @@ static void ggml_compute_forward_sub(
             {
                 ggml_compute_forward_sub_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sub_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5193,6 +5498,55 @@ static void ggml_compute_forward_mul_f32(
     }
 }
 
+static void ggml_compute_forward_mul_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0 ; r < nr0; ++r) {
+                ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
 static void ggml_compute_forward_mul(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5200,13 +5554,17 @@ static void ggml_compute_forward_mul(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
 
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+    GGML_ASSERT((src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && "only f32/f16 src1 supported for now");
 
     switch (src0->type) {
         case GGML_TYPE_F32:
            {
                 ggml_compute_forward_mul_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_mul_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
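Note: with the F16 kernels and the relaxed assert above, an element-wise multiply can now stay in F16 end to end on the CPU backend. A hedged usage sketch against the stock ggml graph API (standard calls, but not taken from this diff):

    #include "ggml.h"
    #include "ggml-cpu.h"

    // Sketch: F16 x F16 element-wise multiply, computed on the CPU backend.
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);
    struct ggml_tensor * c = ggml_mul(ctx, a, b);   // dst stays F16, no f32 round-trip tensor

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    // ... fill a and b (values stored via GGML_FP32_TO_FP16), then:
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 4);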
@@ -5287,18 +5645,71 @@ static void ggml_compute_forward_div_f32(
     }
 }
 
+static void ggml_compute_forward_div_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
 static void ggml_compute_forward_div(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
 
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
                 ggml_compute_forward_div_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_div_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
             }
@@ -5332,6 +5743,31 @@ static void ggml_compute_forward_sqr_f32(
     }
 }
 
+static void ggml_compute_forward_sqr_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(ggml_fp16_t));
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqr_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sqr(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5343,6 +5779,10 @@ static void ggml_compute_forward_sqr(
             {
                 ggml_compute_forward_sqr_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sqr_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5377,6 +5817,31 @@ static void ggml_compute_forward_sqrt_f32(
     }
 }
 
+static void ggml_compute_forward_sqrt_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(ggml_fp16_t));
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqrt_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sqrt(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5388,6 +5853,10 @@ static void ggml_compute_forward_sqrt(
             {
                 ggml_compute_forward_sqrt_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sqrt_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5422,6 +5891,31 @@ static void ggml_compute_forward_log_f32(
     }
 }
 
+static void ggml_compute_forward_log_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_log_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_log(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5433,6 +5927,10 @@ static void ggml_compute_forward_log(
             {
                 ggml_compute_forward_log_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_log_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5467,6 +5965,31 @@ static void ggml_compute_forward_sin_f32(
     }
 }
 
+static void ggml_compute_forward_sin_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sin_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sin(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5478,6 +6001,10 @@ static void ggml_compute_forward_sin(
             {
                 ggml_compute_forward_sin_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sin_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5512,6 +6039,31 @@ static void ggml_compute_forward_cos_f32(
     }
 }
 
+static void ggml_compute_forward_cos_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n  = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_cos_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data  + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_cos(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5523,6 +6075,10 @@ static void ggml_compute_forward_cos(
             {
                 ggml_compute_forward_cos_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_cos_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6092,14 +6648,14 @@ static void ggml_compute_forward_repeat_back(
 
 // ggml_compute_forward_concat
 
-static void ggml_compute_forward_concat_f32(
+static void ggml_compute_forward_concat_any(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
 
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
 
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
+    const size_t len = ggml_type_size(src0->type);
 
     const int ith = params->ith;
     const int nth = params->nth;
@@ -6113,7 +6669,7 @@ static void ggml_compute_forward_concat_f32(
     int64_t o[4] = {0, 0, 0, 0};
     o[dim] = src0->ne[dim];
 
-    const float * x;
+    const char * x;
 
     // TODO: smarter multi-theading
     for (int i3 = 0; i3 < ne3; i3++) {
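Note: concat_any handles arbitrary non-quantized element types by copying len = ggml_type_size(src0->type) bytes per element with memcpy, while the i8/f16/f32 specializations added below avoid that per-element memcpy for the common sizes. The graph-level API is unchanged; a one-line usage sketch (standard ggml call, not from this diff):

    // Sketch: concatenate two tensors along dimension 2.
    struct ggml_tensor * ab = ggml_concat(ctx, a, b, /*dim*/ 2);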
@@ -6121,101 +6677,268 @@ static void ggml_compute_forward_concat_f32(
6121
6677
  for (int i1 = 0; i1 < ne1; i1++) {
6122
6678
  for (int i0 = 0; i0 < ne0; i0++) {
6123
6679
  if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
6124
- x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
6680
+ x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
6125
6681
  } else {
6126
- x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
6682
+ x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
6127
6683
  }
6128
6684
 
6129
- float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
6685
+ char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
6130
6686
 
6131
- *y = *x;
6687
+ memcpy(y, x, len);
6132
6688
  }
6133
6689
  }
6134
6690
  }
6135
6691
  }
6136
6692
  }
6137
6693
 
6138
- static void ggml_compute_forward_concat(
6694
+ static void ggml_compute_forward_concat_i8(
6139
6695
  const struct ggml_compute_params * params,
6140
6696
  struct ggml_tensor * dst) {
6141
6697
 
6142
6698
  const struct ggml_tensor * src0 = dst->src[0];
6699
+ const struct ggml_tensor * src1 = dst->src[1];
6143
6700
 
6144
- switch (src0->type) {
6145
- case GGML_TYPE_F32:
6146
- case GGML_TYPE_I32:
6147
- {
6148
- ggml_compute_forward_concat_f32(params, dst);
6149
- } break;
6150
- default:
6151
- {
6152
- GGML_ABORT("fatal error");
6153
- }
6154
- }
6155
- }
6156
-
6157
- // ggml_compute_forward_abs
6701
+ GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
6158
6702
 
6159
- static void ggml_compute_forward_abs_f32(
6160
- const struct ggml_compute_params * params,
6161
- struct ggml_tensor * dst) {
6703
+ const int ith = params->ith;
6704
+ const int nth = params->nth;
6162
6705
 
6163
- const struct ggml_tensor * src0 = dst->src[0];
6706
+ GGML_TENSOR_BINARY_OP_LOCALS
6164
6707
 
6165
- if (params->ith != 0) {
6166
- return;
6167
- }
6708
+ const int32_t dim = ggml_get_op_params_i32(dst, 0);
6168
6709
 
6169
- assert(ggml_is_contiguous_1(src0));
6170
- assert(ggml_is_contiguous_1(dst));
6171
- assert(ggml_are_same_shape(src0, dst));
6710
+ GGML_ASSERT(dim >= 0 && dim < 4);
6172
6711
 
6173
- const int n = ggml_nrows(src0);
6174
- const int nc = src0->ne[0];
6712
+ int64_t o[4] = {0, 0, 0, 0};
6713
+ o[dim] = src0->ne[dim];
6175
6714
 
6176
- for (int i = 0; i < n; i++) {
6177
- ggml_vec_abs_f32(nc,
6178
- (float *) ((char *) dst->data + i*( dst->nb[1])),
6179
- (float *) ((char *) src0->data + i*(src0->nb[1])));
6180
- }
6181
- }
6715
+ const int8_t * x;
6182
6716
 
6183
- static void ggml_compute_forward_abs(
6184
- const struct ggml_compute_params * params,
6185
- struct ggml_tensor * dst) {
6717
+ // TODO: smarter multi-theading
6718
+ for (int i3 = 0; i3 < ne3; i3++) {
6719
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
+                    } else {
+                        x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }

-    const struct ggml_tensor * src0 = dst->src[0];
+                    int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);

-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_abs_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
+                    *y = *x;
+                }
             }
+        }
     }
 }

-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
+static void ggml_compute_forward_concat_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];

-    if (params->ith != 0) {
-        return;
-    }
+    GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));

-    assert(ggml_is_contiguous_1(src0));
-    assert(ggml_is_contiguous_1(dst));
-    assert(ggml_are_same_shape(src0, dst));
+    const int ith = params->ith;
+    const int nth = params->nth;

-    const int n = ggml_nrows(src0);
-    const int nc = src0->ne[0];
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const ggml_fp16_t * x;
+
+    // TODO: smarter multi-threading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
+                    } else {
+                        x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const int32_t dim = ggml_get_op_params_i32(dst, 0);
+
+    GGML_ASSERT(dim >= 0 && dim < 4);
+
+    int64_t o[4] = {0, 0, 0, 0};
+    o[dim] = src0->ne[dim];
+
+    const float * x;
+
+    // TODO: smarter multi-threading
+    for (int i3 = 0; i3 < ne3; i3++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
+            for (int i1 = 0; i1 < ne1; i1++) {
+                for (int i0 = 0; i0 < ne0; i0++) {
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
+                    } else {
+                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
+                    }
+
+                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
+
+                    *y = *x;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_concat(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
+        case GGML_TYPE_I16:
+            {
+                ggml_compute_forward_concat_f16(params, dst);
+            } break;
+        case GGML_TYPE_I8:
+            {
+                ggml_compute_forward_concat_i8(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+        case GGML_TYPE_I32:
+            {
+                ggml_compute_forward_concat_f32(params, dst);
+            } break;
+        default:
+            {
+                ggml_compute_forward_concat_any(params, dst);
+            }
+    }
+}
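The concat kernels above all share one indexing rule: an output coordinate that still falls inside src0's extent on every axis reads from src0, and anything past the boundary along the concat dimension reads from src1 shifted back by o[dim]. A standalone sketch of that rule (plain C, 1-D float case; the names here are invented for illustration, not package code):

    #include <stdio.h>

    /* concat along the only axis: o plays the role of o[dim] = src0->ne[dim] */
    static void concat_1d(const float *a, int na, const float *b, int nb, float *out) {
        const int o = na;
        for (int i0 = 0; i0 < na + nb; i0++) {
            out[i0] = (i0 < na) ? a[i0] : b[i0 - o];
        }
    }

    int main(void) {
        const float a[2] = {1, 2};
        const float b[3] = {3, 4, 5};
        float out[5];
        concat_1d(a, 2, b, 3, out);
        for (int i = 0; i < 5; i++) {
            printf("%g ", out[i]);      /* prints: 1 2 3 4 5 */
        }
        printf("\n");
        return 0;
    }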
+
+// ggml_compute_forward_abs
+
+static void ggml_compute_forward_abs_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_abs_f32(nc,
+                (float *) ((char *) dst->data + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_abs_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_abs_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+static void ggml_compute_forward_abs(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_abs_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_abs_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
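Each of the unary kernels in this stretch uses the same skeleton: bail out on every thread except 0 (the op is cheap enough that splitting rows is not worth it), then walk ggml_nrows() rows through the byte stride nb[1] and hand nc contiguous elements to a vector helper. A minimal standalone sketch of that row walk, with fabsf standing in for ggml_vec_abs_f32 and the stride given in elements rather than bytes:

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    /* apply |x| row by row; `stride` is nb[1] expressed in elements */
    static void abs_rows(float *dst, const float *src, int n_rows, int nc, size_t stride) {
        for (int i = 0; i < n_rows; i++) {
            for (int k = 0; k < nc; k++) {
                dst[i*stride + k] = fabsf(src[i*stride + k]);
            }
        }
    }

    int main(void) {
        float src[4] = {-1.0f, 2.0f, -3.0f, 4.0f};
        float dst[4];
        abs_rows(dst, src, 2, 2, 2);    /* two rows of two elements */
        printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);   /* 1 2 3 4 */
        return 0;
    }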
+
+// ggml_compute_forward_sgn
+
+static void ggml_compute_forward_sgn_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];

     for (int i = 0; i < n; i++) {
         ggml_vec_sgn_f32(nc,
@@ -6224,6 +6947,30 @@ static void ggml_compute_forward_sgn_f32(
     }
 }

+static void ggml_compute_forward_sgn_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sgn_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sgn(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6235,6 +6982,10 @@ static void ggml_compute_forward_sgn(
             {
                 ggml_compute_forward_sgn_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sgn_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
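The ggml_compute_forward_sgn wrapper shows the dispatch shape every op in this file follows: switch on the source tensor's element type, call the matching _f32/_f16 worker, abort on anything else. A toy-sized sketch of the same pattern (all names here are illustrative, not ggml API):

    #include <stdio.h>

    enum toy_type { TOY_F32, TOY_F16 };

    static void toy_op_f32(float *dst, int n)          { (void)dst; (void)n; printf("f32 worker\n"); }
    static void toy_op_f16(unsigned short *dst, int n) { (void)dst; (void)n; printf("f16 worker\n"); }

    /* same shape as the wrappers above: one switch, one worker per type */
    static void toy_op(enum toy_type t, void *dst, int n) {
        switch (t) {
            case TOY_F32: toy_op_f32((float *)dst, n);          break;
            case TOY_F16: toy_op_f16((unsigned short *)dst, n); break;
            default:      /* upstream: GGML_ABORT("fatal error") */ break;
        }
    }

    int main(void) {
        float buf[1] = {0};
        toy_op(TOY_F32, buf, 1);
        return 0;
    }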
@@ -6268,6 +7019,30 @@ static void ggml_compute_forward_neg_f32(
     }
 }

+static void ggml_compute_forward_neg_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_neg_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_neg(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6279,6 +7054,10 @@ static void ggml_compute_forward_neg(
             {
                 ggml_compute_forward_neg_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_neg_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6312,6 +7091,30 @@ static void ggml_compute_forward_step_f32(
     }
 }

+static void ggml_compute_forward_step_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_step_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_step(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6323,6 +7126,10 @@ static void ggml_compute_forward_step(
             {
                 ggml_compute_forward_step_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_step_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6356,6 +7163,30 @@ static void ggml_compute_forward_tanh_f32(
     }
 }

+static void ggml_compute_forward_tanh_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_tanh_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_tanh(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6367,6 +7198,10 @@ static void ggml_compute_forward_tanh(
             {
                 ggml_compute_forward_tanh_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_tanh_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6400,6 +7235,30 @@ static void ggml_compute_forward_elu_f32(
     }
 }

+static void ggml_compute_forward_elu_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_elu_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_elu(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6411,6 +7270,10 @@ static void ggml_compute_forward_elu(
             {
                 ggml_compute_forward_elu_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_elu_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6444,6 +7307,30 @@ static void ggml_compute_forward_relu_f32(
     }
 }

+static void ggml_compute_forward_relu_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_relu_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_relu(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6455,6 +7342,10 @@ static void ggml_compute_forward_relu(
             {
                 ggml_compute_forward_relu_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_relu_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6488,6 +7379,30 @@ static void ggml_compute_forward_sigmoid_f32(
     }
 }

+static void ggml_compute_forward_sigmoid_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sigmoid_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sigmoid(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6499,6 +7414,113 @@ static void ggml_compute_forward_sigmoid(
             {
                 ggml_compute_forward_sigmoid_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sigmoid_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_gelu
+
+static void ggml_compute_forward_gelu_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_f32(nc,
+                (float *) ((char *) dst->data + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_gelu_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_gelu_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
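The gelu kernels are the first in this group that split work across threads: dr is the row count rounded up per thread, and thread ith owns the half-open range [dr*ith, MIN(dr*ith + dr, nr)), so the last thread absorbs any remainder. A runnable sketch of just that arithmetic, printing the per-thread ranges for 10 rows on 4 threads:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
        const int nr  = 10;                   /* rows */
        const int nth = 4;                    /* threads */
        const int dr  = (nr + nth - 1)/nth;   /* rows per thread, rounded up */
        for (int ith = 0; ith < nth; ith++) {
            const int ir0 = dr*ith;
            const int ir1 = MIN(ir0 + dr, nr);
            printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
        }
        /* prints [0,3) [3,6) [6,9) [9,10) */
        return 0;
    }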
@@ -6506,11 +7528,50 @@ static void ggml_compute_forward_sigmoid(
     }
 }

-// ggml_compute_forward_gelu
-
-static void ggml_compute_forward_gelu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
+// ggml_compute_forward_gelu_quick
+
+static void ggml_compute_forward_gelu_quick_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src0->ne[0];
+    const int nr = ggml_nrows(src0);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_gelu_quick_f32(nc,
+                (float *) ((char *) dst->data + i1*( dst->nb[1])),
+                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            UNUSED(x);
+            assert(!isnan(x));
+            assert(!isinf(x));
+        }
+#endif
+    }
+}
+
+static void ggml_compute_forward_gelu_quick_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];

@@ -6532,22 +7593,23 @@ static void ggml_compute_forward_gelu_f32(
     const int ir1 = MIN(ir0 + dr, nr);

     for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_f32(nc,
-                (float *) ((char *) dst->data + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+        ggml_vec_gelu_quick_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
         }
 #endif
     }
 }

-static void ggml_compute_forward_gelu(
+static void ggml_compute_forward_gelu_quick(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {

@@ -6556,7 +7618,11 @@ static void ggml_compute_forward_gelu(
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
-                ggml_compute_forward_gelu_f32(params, dst);
+                ggml_compute_forward_gelu_quick_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_gelu_quick_f16(params, dst);
             } break;
         default:
             {
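Every f16 variant's NDEBUG block converts through GGML_FP16_TO_FP32 before calling isnan/isinf, because ggml_fp16_t is a 16-bit storage type rather than something the C float classifiers understand. A standalone sketch of the bit-level equivalent (IEEE binary16 encodes NaN and Inf with an all-ones exponent field; these helpers are illustrative, not ggml API):

    #include <stdint.h>
    #include <stdio.h>

    /* binary16: sign(1) | exponent(5, mask 0x7C00) | mantissa(10, mask 0x03FF) */
    static int half_is_nan(uint16_t h) { return (h & 0x7C00) == 0x7C00 && (h & 0x03FF) != 0; }
    static int half_is_inf(uint16_t h) { return (h & 0x7FFF) == 0x7C00; }

    int main(void) {
        printf("%d %d\n", half_is_nan(0x7E00), half_is_inf(0xFC00));   /* 1 1 */
        return 0;
    }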
@@ -6565,9 +7631,9 @@ static void ggml_compute_forward_gelu(
     }
 }

-// ggml_compute_forward_gelu_quick
+// ggml_compute_forward_silu

-static void ggml_compute_forward_gelu_quick_f32(
+static void ggml_compute_forward_silu_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {

@@ -6591,13 +7657,13 @@ static void ggml_compute_forward_gelu_quick_f32(
     const int ir1 = MIN(ir0 + dr, nr);

     for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_quick_f32(nc,
+        ggml_vec_silu_f32(nc,
                 (float *) ((char *) dst->data + i1*( dst->nb[1])),
                 (float *) ((char *) src0->data + i1*(src0->nb[1])));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
             UNUSED(x);
             assert(!isnan(x));
             assert(!isinf(x));
@@ -6606,29 +7672,9 @@ static void ggml_compute_forward_gelu_quick_f32(
     }
 }

-static void ggml_compute_forward_gelu_quick(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
-
-    const struct ggml_tensor * src0 = dst->src[0];
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_quick_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_silu
-
-static void ggml_compute_forward_silu_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
+static void ggml_compute_forward_silu_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];

@@ -6650,16 +7696,17 @@ static void ggml_compute_forward_silu_f32(
     const int ir1 = MIN(ir0 + dr, nr);

     for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_f32(nc,
-                (float *) ((char *) dst->data + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
+        ggml_vec_silu_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));

 #ifndef NDEBUG
         for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
         }
 #endif
     }
@@ -6676,6 +7723,10 @@ static void ggml_compute_forward_silu(
             {
                 ggml_compute_forward_silu_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_silu_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6714,6 +7765,36 @@ static void ggml_compute_forward_leaky_relu_f32(
     }
 }

+static void ggml_compute_forward_leaky_relu_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    assert(dst->nb[0] == sizeof(ggml_fp16_t));
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_leaky_relu_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
+    }
+}
+
 static void ggml_compute_forward_leaky_relu(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6725,6 +7806,10 @@ static void ggml_compute_forward_leaky_relu(
             {
                 ggml_compute_forward_leaky_relu_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_leaky_relu_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
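ggml_compute_forward_leaky_relu_f16 reads its slope the same way the f32 path does: per-op scalars ride in the tensor's op_params bytes and come out via memcpy, which avoids the strict-aliasing trouble a pointer cast could cause. A minimal sketch of that round trip (standalone; a local buffer stands in for dst->op_params):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        int32_t op_params[4] = {0};          /* stands in for dst->op_params */
        const float slope = 0.1f;
        memcpy(&op_params[0], &slope, sizeof(float));       /* producer side */

        float negative_slope;                /* consumer side, as in the kernel */
        memcpy(&negative_slope, &op_params[0], sizeof(float));
        printf("%g\n", negative_slope);      /* 0.1 */
        return 0;
    }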
@@ -6777,6 +7862,50 @@ static void ggml_compute_forward_silu_back_f32(
     }
 }

+static void ggml_compute_forward_silu_back_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * grad = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src1, dst));
+    assert(ggml_are_same_shape(src1, grad));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = src1->ne[0];
+    const int nr = ggml_nrows(src1);
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    for (int i1 = ir0; i1 < ir1; i1++) {
+        ggml_vec_silu_backward_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
+                (ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
+
+#ifndef NDEBUG
+        for (int k = 0; k < nc; k++) {
+            const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
+            const float v = GGML_FP16_TO_FP32(x);
+            UNUSED(v);
+            assert(!isnan(v));
+            assert(!isinf(v));
+        }
+#endif
+    }
+}
+
 static void ggml_compute_forward_silu_back(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6788,6 +7917,10 @@ static void ggml_compute_forward_silu_back(
             {
                 ggml_compute_forward_silu_back_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_silu_back_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
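For reference, the rule the silu backward kernels implement, written out in scalar form: with s = sigmoid(x), d silu(x)/dx = s*(1 + x*(1 - s)), and the kernel scales that by the incoming gradient row. A sketch under that assumption, not the ggml_vec_silu_backward_* code itself:

    #include <math.h>
    #include <stdio.h>

    /* dy is the incoming gradient; returns dy * d silu(x)/dx */
    static float silu_backward_scalar(float x, float dy) {
        const float s = 1.0f/(1.0f + expf(-x));     /* sigmoid(x) */
        return dy*s*(1.0f + x*(1.0f - s));
    }

    int main(void) {
        printf("%f\n", silu_backward_scalar(0.0f, 1.0f));   /* 0.5 at x = 0 */
        return 0;
    }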
@@ -6795,7 +7928,6 @@ static void ggml_compute_forward_silu_back(
     }
 }

-
 static void ggml_compute_forward_hardswish_f32(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6819,6 +7951,31 @@ static void ggml_compute_forward_hardswish_f32(
                 (float *) ((char *) src0->data + i*(src0->nb[1])));
     }
 }
+
+static void ggml_compute_forward_hardswish_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardswish_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_hardswish(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6830,6 +7987,10 @@ static void ggml_compute_forward_hardswish(
             {
                 ggml_compute_forward_hardswish_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_hardswish_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6861,6 +8022,30 @@ static void ggml_compute_forward_hardsigmoid_f32(
     }
 }

+static void ggml_compute_forward_hardsigmoid_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_hardsigmoid_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_hardsigmoid(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6872,6 +8057,10 @@ static void ggml_compute_forward_hardsigmoid(
             {
                 ggml_compute_forward_hardsigmoid_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_hardsigmoid_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
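Scalar form of the two activations gaining f16 paths here, assuming the usual piecewise definitions (hardsigmoid(x) = clamp((x + 3)/6, 0, 1) and hardswish(x) = x*hardsigmoid(x)); a sketch, not the ggml_vec_* implementations:

    #include <stdio.h>

    static float hardsigmoid_scalar(float x) {
        const float t = (x + 3.0f)/6.0f;
        return t < 0.0f ? 0.0f : (t > 1.0f ? 1.0f : t);
    }

    static float hardswish_scalar(float x) {
        return x*hardsigmoid_scalar(x);
    }

    int main(void) {
        printf("%g %g\n", hardsigmoid_scalar(0.0f), hardswish_scalar(3.0f));   /* 0.5 3 */
        return 0;
    }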
@@ -6903,6 +8092,30 @@ static void ggml_compute_forward_exp_f32(
     }
 }

+static void ggml_compute_forward_exp_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_exp_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_exp(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -6914,6 +8127,10 @@ static void ggml_compute_forward_exp(
             {
                 ggml_compute_forward_exp_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_exp_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -9198,6 +10415,43 @@ static void ggml_compute_forward_clamp_f32(
     }
 }

+static void ggml_compute_forward_clamp_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    float min;
+    float max;
+    memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    for (int j = ith; j < n; j += nth) {
+        ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
+        ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
+
+        for (int i = 0; i < nc; i++) {
+            float v = GGML_FP16_TO_FP32(src0_ptr[i]);
+            dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
+        }
+    }
+}
+
 static void ggml_compute_forward_clamp(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -9210,6 +10464,9 @@ static void ggml_compute_forward_clamp(
             ggml_compute_forward_clamp_f32(params, dst);
         } break;
     case GGML_TYPE_F16:
+        {
+            ggml_compute_forward_clamp_f16(params, dst);
+        } break;
     case GGML_TYPE_BF16:
     case GGML_TYPE_Q4_0:
     case GGML_TYPE_Q4_1:
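The new clamp_f16 path is a decode/clamp/encode round trip: each half is widened with GGML_FP16_TO_FP32, clamped with the same MAX(MIN(v, max), min) expression the f32 kernel uses, and narrowed back with GGML_FP32_TO_FP16. The clamp expression itself, isolated in plain-float C:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    static float clamp_scalar(float v, float lo, float hi) {
        return MAX(MIN(v, hi), lo);     /* same expression as the kernel */
    }

    int main(void) {
        printf("%g %g %g\n",
               clamp_scalar(-2.0f, -1.0f, 1.0f),    /* -1  */
               clamp_scalar( 0.5f, -1.0f, 1.0f),    /* 0.5 */
               clamp_scalar( 9.0f, -1.0f, 1.0f));   /*  1  */
        return 0;
    }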
@@ -14322,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
 #endif
 }

+int ggml_cpu_has_bmi2(void) {
+#if defined(__BMI2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
     return 1;
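ggml_cpu_has_bmi2 and the other x86 probes in this block answer at compile time: the result is fixed by whichever predefined macros the compiler sets for the target flags (e.g. -mbmi2 defines __BMI2__), not by a runtime CPUID check; the ARM ones that consult ggml_arm_arch_features are the exception. A sketch of the pattern with a hypothetical feature macro:

    #include <stdio.h>

    static int cpu_has_toy_feature(void) {
    #if defined(__TOY_FEATURE__)    /* hypothetical predefined macro */
        return 1;
    #else
        return 0;
    #endif
    }

    int main(void) {
        printf("toy feature: %d\n", cpu_has_toy_feature());
        return 0;
    }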
@@ -14402,6 +15667,14 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }

+int ggml_cpu_has_vxe(void) {
+#if defined(__VXE__) || defined(__VXE2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return ggml_arm_arch_features.has_neon;
@@ -14442,6 +15715,14 @@ int ggml_cpu_get_sve_cnt(void) {
 #endif
 }

+int ggml_cpu_has_sme(void) {
+#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
+    return ggml_arm_arch_features.has_sme;
+#else
+    return 0;
+#endif
+}
+
 void ggml_cpu_init(void) {
     // needed to initialize f16 tables
     {