@fugood/llama.node 0.3.12 → 0.3.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +2 -1
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +14 -0
- package/src/LlamaContext.cpp +110 -79
- package/src/LlamaContext.h +1 -1
- package/src/common.hpp +1 -2
- package/src/llama.cpp/.github/workflows/build.yml +95 -13
- package/src/llama.cpp/.github/workflows/docker.yml +2 -0
- package/src/llama.cpp/.github/workflows/labeler.yml +1 -1
- package/src/llama.cpp/.github/workflows/server.yml +2 -0
- package/src/llama.cpp/common/CMakeLists.txt +23 -6
- package/src/llama.cpp/common/arg.cpp +292 -14
- package/src/llama.cpp/common/chat.cpp +1128 -315
- package/src/llama.cpp/common/chat.h +135 -0
- package/src/llama.cpp/common/common.cpp +27 -171
- package/src/llama.cpp/common/common.h +41 -73
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +4 -5
- package/src/llama.cpp/common/json-schema-to-grammar.h +0 -1
- package/src/llama.cpp/common/llguidance.cpp +3 -3
- package/src/llama.cpp/common/log.cpp +1 -0
- package/src/llama.cpp/common/log.h +2 -1
- package/src/llama.cpp/common/{chat-template.hpp → minja/chat-template.hpp} +21 -7
- package/src/llama.cpp/common/{minja.hpp → minja/minja.hpp} +61 -14
- package/src/llama.cpp/common/ngram-cache.cpp +1 -0
- package/src/llama.cpp/common/sampling.cpp +93 -49
- package/src/llama.cpp/common/speculative.cpp +6 -5
- package/src/llama.cpp/common/speculative.h +1 -1
- package/src/llama.cpp/docs/build.md +47 -9
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +3 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +1 -0
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +4 -2
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +4 -4
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +6 -5
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +1 -1
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +373 -107
- package/src/llama.cpp/examples/llava/clip.h +19 -3
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +341 -0
- package/src/llama.cpp/examples/llava/llava.cpp +4 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +30 -11
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -0
- package/src/llama.cpp/examples/main/main.cpp +73 -28
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -0
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -0
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +1 -0
- package/src/llama.cpp/examples/quantize/quantize.cpp +1 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +882 -237
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +35 -26
- package/src/llama.cpp/examples/run/run.cpp +115 -79
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/server/httplib.h +381 -292
- package/src/llama.cpp/examples/server/server.cpp +134 -128
- package/src/llama.cpp/examples/server/utils.hpp +95 -106
- package/src/llama.cpp/examples/sycl/run-llama2.sh +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +251 -142
- package/src/llama.cpp/ggml/CMakeLists.txt +13 -1
- package/src/llama.cpp/ggml/include/ggml-alloc.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-backend.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-cpu.h +4 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +0 -2
- package/src/llama.cpp/ggml/include/ggml.h +6 -2
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-alloc.c +24 -15
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +58 -54
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +3 -5
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +132 -17
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +156 -11
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +2235 -641
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1572 -198
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +24 -5
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +288 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +9 -8
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +16 -3
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +14 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +4 -5
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +235 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +6 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +246 -120
- package/src/llama.cpp/ggml/src/ggml-quants.c +114 -114
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +2 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +51 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +33 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +701 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +136 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +308 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +174 -728
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -77
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +13 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +949 -602
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +37 -3
- package/src/llama.cpp/ggml/src/ggml.c +9 -4
- package/src/llama.cpp/include/llama.h +32 -14
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +1 -0
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +12 -0
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/src/llama-arch.cpp +21 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +1 -0
- package/src/llama.cpp/src/llama-grammar.cpp +183 -183
- package/src/llama.cpp/src/llama-grammar.h +13 -4
- package/src/llama.cpp/src/llama-impl.h +6 -6
- package/src/llama.cpp/src/llama-kv-cache.h +2 -1
- package/src/llama.cpp/src/llama-mmap.cpp +11 -1
- package/src/llama.cpp/src/llama-mmap.h +1 -0
- package/src/llama.cpp/src/llama-model.cpp +70 -6
- package/src/llama.cpp/src/llama-sampling.cpp +174 -67
- package/src/llama.cpp/src/llama-vocab.cpp +12 -0
- package/src/llama.cpp/src/llama.cpp +154 -5
- package/src/llama.cpp/src/unicode.cpp +9 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +171 -115
- package/src/llama.cpp/tests/test-chat-template.cpp +32 -22
- package/src/llama.cpp/tests/test-chat.cpp +691 -325
- package/src/llama.cpp/tests/test-gguf.cpp +4 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +63 -63
- package/src/llama.cpp/tests/test-quantize-fns.cpp +1 -9
- package/src/llama.cpp/tests/test-sampling.cpp +15 -0
- package/src/llama.cpp/Sources/llama/llama.h +0 -4
- package/src/llama.cpp/common/chat.hpp +0 -52
@@ -7,10 +7,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
-#include "amx/amx.h"
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -114,7 +112,8 @@ struct ggml_arm_arch_features_type {
     int has_i8mm;
     int has_sve;
     int sve_cnt;
-
+    int has_sme;
+} ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
 #endif


@@ -238,6 +237,8 @@ typedef pthread_t ggml_thread_t;
 #else
 #if defined(__POWER9_VECTOR__)
 #define CACHE_LINE_SIZE 128
+#elif defined(__VXE__) || defined(__VXE2__)
+#define CACHE_LINE_SIZE 256
 #else
 #define CACHE_LINE_SIZE 64
 #endif
@@ -246,9 +247,9 @@ typedef pthread_t ggml_thread_t;
 static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);


-static void ggml_vec_dot_f32(int n, float *
-static void ggml_vec_dot_f16(int n, float *
-static void ggml_vec_dot_bf16(int n, float *
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);

 static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32] = {
@@ -1078,29 +1079,23 @@ do { \
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 8

-// F16 arithmetic is not supported by
+// F16 arithmetic is not supported by LASX, so we use F32 instead

 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

 static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
-
-
-
-
-}
-
-    return (__m256)__lasx_xvld(tmp, 0);
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
-    float arr[8];
-
-    __lasx_xvst(y, arr, 0);

-
-
-
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1218,6 +1213,87 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

+#elif defined(__VXE__) || defined(__VXE2__)
+
+#define GGML_SIMD
+
+// F32 s390x
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR 4
+
+#define GGML_F32x4 __vector float
+#define GGML_F32x4_ZERO vec_splats(0.0f)
+#define GGML_F32x4_SET1 vec_splats
+#define GGML_F32x4_LOAD(p) vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD vec_add
+#define GGML_F32x4_MUL vec_mul
+#define GGML_F32x4_REDUCE(res, x) \
+{ \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset + i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset + i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = vec_add(x[i], x[offset + i]); \
+    } \
+    res = vec_extract(x[0], 0) + \
+          vec_extract(x[0], 1) + \
+          vec_extract(x[0], 2) + \
+          vec_extract(x[0], 3); \
+}
+
+#define GGML_F32_VEC GGML_F32x4
+#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 s390x
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR GGML_F32_EPR
+
+static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return vec_xl(0, tmp);
+}
+
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
+    float arr[4];
+
+    vec_xst(y, 0, arr);
+
+    for (int i = 0; i < 4; i++) {
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
+}
+
+#define GGML_F16_VEC GGML_F32x4
+#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
+#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
+#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+
 #endif

 // GGML_F32_ARR / GGML_F16_ARR
@@ -1297,7 +1373,7 @@ struct ggml_threadpool {
     atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
@@ -1339,17 +1415,43 @@ inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x)
 inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
+inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
 inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
 inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
 inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
+inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
+inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+    }
+}
+
 inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
+inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+    }
+}
 inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
+inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+    }
+}

-static void ggml_vec_dot_f32(int n, float *
+static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -1392,7 +1494,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
     *s = sumf;
 }

-static void ggml_vec_dot_bf16(int n, float *
+static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -1460,7 +1562,7 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     *s = sumf;
 }

-static void ggml_vec_dot_f16(int n, float *
+static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
     assert(nrc == 1);
     UNUSED(nrc);
     UNUSED(bx);
@@ -1504,10 +1606,10 @@ static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t *

 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float *
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
     ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

-    ggml_fp16_t *
+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

     for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
         x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
@@ -1557,7 +1659,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re
     }
 }

-inline static void ggml_vec_mad_f32(const int n, float *
+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F32_STEP - 1));

@@ -1588,7 +1690,7 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }

-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t *
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));

@@ -1620,10 +1722,10 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const
 }

 // xs and vs are byte strides of x and v
-inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float *
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {

-    const float *
-    const float *
+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

     for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
         x[i] = (const float *) ((const char *) xv + i*xs);
@@ -1734,22 +1836,107 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float

 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
 inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
+inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*v);
+    }
+}
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
+inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
+inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
+inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+    }
+}
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+    }
+}
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
+inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+    }
+}
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+    }
+}
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+    }
+}
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+    }
+}
 // TODO: optimize performance
 inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+    }
+}
 inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+    }
+}
 inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+    }
+}

 static const float GELU_COEF_A = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1817,14 +2004,25 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
 }
 #endif

+inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+    }
+}
+
 // Sigmoid Linear Unit (SiLU) function
 inline static float ggml_silu_f32(float x) {
     return x/(1.0f + expf(-x));
 }
+inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
+    float v = GGML_FP16_TO_FP32(x);
+    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+}

 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif

 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -2044,6 +2242,12 @@ static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
     }
 }

+inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_silu_f16(x[i]);
+    }
+}
+
 static ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
     int i = 0;
     ggml_float sum = 0;
@@ -2115,12 +2319,24 @@ inline static float ggml_silu_backward_f32(float x, float dy) {
     return dy*s*(1.0f + x*(1.0f - s));
 }

+inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
+    const float v = GGML_FP16_TO_FP32(x);
+    const float s = 1.0f/(1.0f + expf(-v));
+    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+}
+
 inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
     for (int i = 0; i < n; ++i) {
         dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
     }
 }

+inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
+    }
+}
+
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     ggml_float sum = 0.0;
@@ -2389,15 +2605,20 @@ bool ggml_is_numa(void) {
 #define HWCAP2_I8MM (1 << 13)
 #endif

+#if !defined(HWCAP2_SME)
+#define HWCAP2_SME (1 << 23)
+#endif
+
 static void ggml_init_arm_arch_features(void) {
 #if defined(__linux__) && defined(__aarch64__)
     uint32_t hwcap = getauxval(AT_HWCAP);
     uint32_t hwcap2 = getauxval(AT_HWCAP2);

-    ggml_arm_arch_features.has_neon
+    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
     ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
-    ggml_arm_arch_features.has_i8mm
-    ggml_arm_arch_features.has_sve
+    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
+    ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);

 #if defined(__ARM_FEATURE_SVE)
     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
@@ -2420,6 +2641,11 @@ static void ggml_init_arm_arch_features(void) {
     }
     ggml_arm_arch_features.has_i8mm = oldp;

+    if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_arm_arch_features.has_sme = oldp;
+
     ggml_arm_arch_features.has_sve = 0;
     ggml_arm_arch_features.sve_cnt = 0;
 #else
@@ -2443,6 +2669,12 @@ static void ggml_init_arm_arch_features(void) {
     ggml_arm_arch_features.has_sve = 0;
     ggml_arm_arch_features.sve_cnt = 0;
 #endif
+
+#if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
+    ggml_arm_arch_features.has_sme = 1;
+#else
+    ggml_arm_arch_features.has_sme = 0;
+#endif
 #endif
 }
 #endif
@@ -4287,7 +4519,7 @@ static void ggml_compute_forward_add_f16_f16(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];

-    GGML_ASSERT(
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     const int ith = params->ith;
     const int nth = params->nth;
@@ -4312,17 +4544,22 @@ static void ggml_compute_forward_add_f16_f16(

     if (nb10 == sizeof(ggml_fp16_t)) {
         for (int ir = ir0; ir < ir1; ++ir) {
-            // src0
-            const
-            const
-            const
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

-
-
-
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;

-
-
+            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
             }
         }
     }
@@ -5110,6 +5347,62 @@ static void ggml_compute_forward_sub_f32(
     }
 }

+static void ggml_compute_forward_sub_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
 static void ggml_compute_forward_sub(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5121,6 +5414,10 @@ static void ggml_compute_forward_sub(
             {
                 ggml_compute_forward_sub_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sub_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5201,32 +5498,9 @@ static void ggml_compute_forward_mul_f32(
     }
 }

-static void
-
-
-
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
-
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_f32(params, dst);
-            } break;
-        default:
-            {
-                GGML_ABORT("fatal error");
-            }
-    }
-}
-
-// ggml_compute_forward_div
-
-static void ggml_compute_forward_div_f32(
-        const struct ggml_compute_params * params,
-        struct ggml_tensor * dst) {
+static void ggml_compute_forward_mul_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {

     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -5240,10 +5514,14 @@ static void ggml_compute_forward_div_f32(

     GGML_TENSOR_BINARY_OP_LOCALS

-    GGML_ASSERT(
-    GGML_ASSERT(
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);

-
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
             // src0 and dst are same shape => same indices
             const int64_t i03 = ir/(ne02*ne01);
@@ -5255,13 +5533,85 @@ static void ggml_compute_forward_div_f32(
             const int64_t i11 = i01 % ne11;
             const int64_t nr0 = ne00 / ne10;

-
-
-
-
-            for (int64_t r = 0; r < nr0; ++r) {
-
-
+            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0 ; r < nr0; ++r) {
+                ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
+static void ggml_compute_forward_mul(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT((src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && "only f32/f16 src1 supported for now");
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_mul_f32(params, dst);
+            } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_mul_f16(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_div
+
+static void ggml_compute_forward_div_f32(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    if (nb10 == sizeof(float)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                UNUSED(ggml_vec_div_f32);

                 vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
@@ -5295,6 +5645,55 @@ static void ggml_compute_forward_div_f32(
     }
 }

+static void ggml_compute_forward_div_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F16);
+
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+
+    if (nb10 == sizeof(ggml_fp16_t)) {
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
+                ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        }
+    } else {
+        // src1 is not contiguous
+        GGML_ABORT("unimplemented error");
+    }
+}
+
 static void ggml_compute_forward_div(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5306,6 +5705,10 @@ static void ggml_compute_forward_div(
             {
                 ggml_compute_forward_div_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_div_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5340,6 +5743,31 @@ static void ggml_compute_forward_sqr_f32(
     }
 }

+static void ggml_compute_forward_sqr_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(ggml_fp16_t));
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqr_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sqr(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5351,6 +5779,10 @@ static void ggml_compute_forward_sqr(
             {
                 ggml_compute_forward_sqr_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sqr_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5385,6 +5817,31 @@ static void ggml_compute_forward_sqrt_f32(
     }
 }

+static void ggml_compute_forward_sqrt_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    assert(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    assert( dst->nb[0] == sizeof(ggml_fp16_t));
+    assert(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sqrt_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sqrt(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5396,6 +5853,10 @@ static void ggml_compute_forward_sqrt(
             {
                 ggml_compute_forward_sqrt_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sqrt_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5430,6 +5891,31 @@ static void ggml_compute_forward_log_f32(
     }
 }

+static void ggml_compute_forward_log_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_log_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_log(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5441,6 +5927,10 @@ static void ggml_compute_forward_log(
             {
                 ggml_compute_forward_log_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_log_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5475,6 +5965,31 @@ static void ggml_compute_forward_sin_f32(
     }
 }

+static void ggml_compute_forward_sin_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_sin_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_sin(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5486,6 +6001,10 @@ static void ggml_compute_forward_sin(
             {
                 ggml_compute_forward_sin_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_sin_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -5520,6 +6039,31 @@ static void ggml_compute_forward_cos_f32(
     }
 }

+static void ggml_compute_forward_cos_f16(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int n = ggml_nrows(src0);
+    const int nc = src0->ne[0];
+
+    GGML_ASSERT( dst->nb[0] == sizeof(ggml_fp16_t));
+    GGML_ASSERT(src0->nb[0] == sizeof(ggml_fp16_t));
+
+    for (int i = 0; i < n; i++) {
+        ggml_vec_cos_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
+                (ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
 static void ggml_compute_forward_cos(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
@@ -5531,6 +6075,10 @@ static void ggml_compute_forward_cos(
             {
                 ggml_compute_forward_cos_f32(params, dst);
             } break;
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_cos_f16(params, dst);
+            } break;
         default:
             {
                 GGML_ABORT("fatal error");
@@ -6100,14 +6648,14 @@ static void ggml_compute_forward_repeat_back(
|
|
|
6100
6648
|
|
|
6101
6649
|
// ggml_compute_forward_concat
|
|
6102
6650
|
|
|
6103
|
-
static void ggml_compute_forward_concat_f32(
|
|
6651
|
+
static void ggml_compute_forward_concat_any(
|
|
6104
6652
|
const struct ggml_compute_params * params,
|
|
6105
6653
|
struct ggml_tensor * dst) {
|
|
6106
6654
|
|
|
6107
6655
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
6108
6656
|
const struct ggml_tensor * src1 = dst->src[1];
|
|
6109
6657
|
|
|
6110
|
-
|
|
6658
|
+
const size_t len = ggml_type_size(src0->type);
|
|
6111
6659
|
|
|
6112
6660
|
const int ith = params->ith;
|
|
6113
6661
|
const int nth = params->nth;
|
|
@@ -6121,7 +6669,7 @@ static void ggml_compute_forward_concat_f32(
|
|
|
6121
6669
|
int64_t o[4] = {0, 0, 0, 0};
|
|
6122
6670
|
o[dim] = src0->ne[dim];
|
|
6123
6671
|
|
|
6124
|
-
const float * x;
|
|
6672
|
+
const char * x;
|
|
6125
6673
|
|
|
6126
6674
|
// TODO: smarter multi-theading
|
|
6127
6675
|
for (int i3 = 0; i3 < ne3; i3++) {
|
|
@@ -6129,40 +6677,179 @@ static void ggml_compute_forward_concat_f32(
|
|
|
6129
6677
|
for (int i1 = 0; i1 < ne1; i1++) {
|
|
6130
6678
|
for (int i0 = 0; i0 < ne0; i0++) {
|
|
6131
6679
|
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|
6132
|
-
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
|
6680
|
+
x = (const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03;
|
|
6133
6681
|
} else {
|
|
6134
|
-
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
|
6682
|
+
x = (const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13;
|
|
6135
6683
|
}
|
|
6136
6684
|
|
|
6137
|
-
|
|
6685
|
+
char * y = (char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3;
|
|
6138
6686
|
|
|
6139
|
-
|
|
6687
|
+
memcpy(y, x, len);
|
|
6140
6688
|
}
|
|
6141
6689
|
}
|
|
6142
6690
|
}
|
|
6143
6691
|
}
|
|
6144
6692
|
}
|
|
6145
6693
|
|
|
6146
|
-
static void ggml_compute_forward_concat(
|
|
6694
|
+
static void ggml_compute_forward_concat_i8(
|
|
6147
6695
|
const struct ggml_compute_params * params,
|
|
6148
6696
|
struct ggml_tensor * dst) {
|
|
6149
6697
|
|
|
6150
6698
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
6699
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
|
6151
6700
|
|
|
6152
|
-
|
|
6153
|
-
|
|
6154
|
-
|
|
6155
|
-
|
|
6156
|
-
|
|
6157
|
-
|
|
6158
|
-
|
|
6159
|
-
|
|
6160
|
-
|
|
6161
|
-
|
|
6162
|
-
|
|
6163
|
-
}
|
|
6164
|
-
|
|
6165
|
-
|
|
6701
|
+
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(int8_t));
|
|
6702
|
+
|
|
6703
|
+
const int ith = params->ith;
|
|
6704
|
+
const int nth = params->nth;
|
|
6705
|
+
|
|
6706
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
6707
|
+
|
|
6708
|
+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
|
6709
|
+
|
|
6710
|
+
GGML_ASSERT(dim >= 0 && dim < 4);
|
|
6711
|
+
|
|
6712
|
+
int64_t o[4] = {0, 0, 0, 0};
|
|
6713
|
+
o[dim] = src0->ne[dim];
|
|
6714
|
+
|
|
6715
|
+
const int8_t * x;
|
|
6716
|
+
|
|
6717
|
+
// TODO: smarter multi-theading
|
|
6718
|
+
for (int i3 = 0; i3 < ne3; i3++) {
|
|
6719
|
+
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
|
6720
|
+
for (int i1 = 0; i1 < ne1; i1++) {
|
|
6721
|
+
for (int i0 = 0; i0 < ne0; i0++) {
|
|
6722
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|
6723
|
+
x = (const int8_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
|
6724
|
+
} else {
|
|
6725
|
+
x = (const int8_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
|
6726
|
+
}
|
|
6727
|
+
|
|
6728
|
+
int8_t * y = (int8_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
|
6729
|
+
|
|
6730
|
+
*y = *x;
|
|
6731
|
+
}
|
|
6732
|
+
}
|
|
6733
|
+
}
|
|
6734
|
+
}
|
|
6735
|
+
}
|
|
6736
|
+
|
|
6737
|
+
static void ggml_compute_forward_concat_f16(
|
|
6738
|
+
const struct ggml_compute_params * params,
|
|
6739
|
+
struct ggml_tensor * dst) {
|
|
6740
|
+
|
|
6741
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
6742
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
|
6743
|
+
|
|
6744
|
+
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(ggml_fp16_t));
|
|
6745
|
+
|
|
6746
|
+
const int ith = params->ith;
|
|
6747
|
+
const int nth = params->nth;
|
|
6748
|
+
|
|
6749
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
6750
|
+
|
|
6751
|
+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
|
6752
|
+
|
|
6753
|
+
GGML_ASSERT(dim >= 0 && dim < 4);
|
|
6754
|
+
|
|
6755
|
+
int64_t o[4] = {0, 0, 0, 0};
|
|
6756
|
+
o[dim] = src0->ne[dim];
|
|
6757
|
+
|
|
6758
|
+
const ggml_fp16_t * x;
|
|
6759
|
+
|
|
6760
|
+
// TODO: smarter multi-theading
|
|
6761
|
+
for (int i3 = 0; i3 < ne3; i3++) {
|
|
6762
|
+
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
|
6763
|
+
for (int i1 = 0; i1 < ne1; i1++) {
|
|
6764
|
+
for (int i0 = 0; i0 < ne0; i0++) {
|
|
6765
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|
6766
|
+
x = (const ggml_fp16_t *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
|
6767
|
+
} else {
|
|
6768
|
+
x = (const ggml_fp16_t *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
|
6769
|
+
}
|
|
6770
|
+
|
|
6771
|
+
ggml_fp16_t * y = (ggml_fp16_t *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
|
6772
|
+
|
|
6773
|
+
*y = *x;
|
|
6774
|
+
}
|
|
6775
|
+
}
|
|
6776
|
+
}
|
|
6777
|
+
}
|
|
6778
|
+
}
|
|
6779
|
+
|
|
6780
|
+
static void ggml_compute_forward_concat_f32(
|
|
6781
|
+
const struct ggml_compute_params * params,
|
|
6782
|
+
struct ggml_tensor * dst) {
|
|
6783
|
+
|
|
6784
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
6785
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
|
6786
|
+
|
|
6787
|
+
GGML_ASSERT(ggml_type_size(src0->type) == sizeof(float));
|
|
6788
|
+
|
|
6789
|
+
const int ith = params->ith;
|
|
6790
|
+
const int nth = params->nth;
|
|
6791
|
+
|
|
6792
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
6793
|
+
|
|
6794
|
+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
|
6795
|
+
|
|
6796
|
+
GGML_ASSERT(dim >= 0 && dim < 4);
|
|
6797
|
+
|
|
6798
|
+
int64_t o[4] = {0, 0, 0, 0};
|
|
6799
|
+
o[dim] = src0->ne[dim];
|
|
6800
|
+
|
|
6801
|
+
const float * x;
|
|
6802
|
+
|
|
6803
|
+
// TODO: smarter multi-theading
|
|
6804
|
+
for (int i3 = 0; i3 < ne3; i3++) {
|
|
6805
|
+
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
|
6806
|
+
for (int i1 = 0; i1 < ne1; i1++) {
|
|
6807
|
+
for (int i0 = 0; i0 < ne0; i0++) {
|
|
6808
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
|
6809
|
+
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
|
6810
|
+
} else {
|
|
6811
|
+
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
|
6812
|
+
}
|
|
6813
|
+
|
|
6814
|
+
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
|
6815
|
+
|
|
6816
|
+
*y = *x;
|
|
6817
|
+
}
|
|
6818
|
+
}
|
|
6819
|
+
}
|
|
6820
|
+
}
|
|
6821
|
+
}
|
|
6822
|
+
|
|
6823
|
+
static void ggml_compute_forward_concat(
|
|
6824
|
+
const struct ggml_compute_params * params,
|
|
6825
|
+
struct ggml_tensor * dst) {
|
|
6826
|
+
|
|
6827
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
6828
|
+
|
|
6829
|
+
switch (src0->type) {
|
|
6830
|
+
case GGML_TYPE_F16:
|
|
6831
|
+
case GGML_TYPE_BF16:
|
|
6832
|
+
case GGML_TYPE_I16:
|
|
6833
|
+
{
|
|
6834
|
+
ggml_compute_forward_concat_f16(params, dst);
|
|
6835
|
+
} break;
|
|
6836
|
+
case GGML_TYPE_I8:
|
|
6837
|
+
{
|
|
6838
|
+
ggml_compute_forward_concat_i8(params, dst);
|
|
6839
|
+
} break;
|
|
6840
|
+
case GGML_TYPE_F32:
|
|
6841
|
+
case GGML_TYPE_I32:
|
|
6842
|
+
{
|
|
6843
|
+
ggml_compute_forward_concat_f32(params, dst);
|
|
6844
|
+
} break;
|
|
6845
|
+
default:
|
|
6846
|
+
{
|
|
6847
|
+
ggml_compute_forward_concat_any(params, dst);
|
|
6848
|
+
}
|
|
6849
|
+
}
|
|
6850
|
+
}
|
|
6851
|
+
|
|
6852
|
+
// ggml_compute_forward_abs
|
|
6166
6853
|
|
|
6167
6854
|
static void ggml_compute_forward_abs_f32(
|
|
6168
6855
|
const struct ggml_compute_params * params,
|
|
@@ -6188,6 +6875,30 @@ static void ggml_compute_forward_abs_f32(
|
|
|
6188
6875
|
}
|
|
6189
6876
|
}
|
|
6190
6877
|
|
|
6878
|
+
static void ggml_compute_forward_abs_f16(
|
|
6879
|
+
const struct ggml_compute_params * params,
|
|
6880
|
+
struct ggml_tensor * dst) {
|
|
6881
|
+
|
|
6882
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
6883
|
+
|
|
6884
|
+
if (params->ith != 0) {
|
|
6885
|
+
return;
|
|
6886
|
+
}
|
|
6887
|
+
|
|
6888
|
+
assert(ggml_is_contiguous_1(src0));
|
|
6889
|
+
assert(ggml_is_contiguous_1(dst));
|
|
6890
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
6891
|
+
|
|
6892
|
+
const int n = ggml_nrows(src0);
|
|
6893
|
+
const int nc = src0->ne[0];
|
|
6894
|
+
|
|
6895
|
+
for (int i = 0; i < n; i++) {
|
|
6896
|
+
ggml_vec_abs_f16(nc,
|
|
6897
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
6898
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
6899
|
+
}
|
|
6900
|
+
}
|
|
6901
|
+
|
|
6191
6902
|
static void ggml_compute_forward_abs(
|
|
6192
6903
|
const struct ggml_compute_params * params,
|
|
6193
6904
|
struct ggml_tensor * dst) {
|
|
@@ -6199,6 +6910,10 @@ static void ggml_compute_forward_abs(
|
|
|
6199
6910
|
{
|
|
6200
6911
|
ggml_compute_forward_abs_f32(params, dst);
|
|
6201
6912
|
} break;
|
|
6913
|
+
case GGML_TYPE_F16:
|
|
6914
|
+
{
|
|
6915
|
+
ggml_compute_forward_abs_f16(params, dst);
|
|
6916
|
+
} break;
|
|
6202
6917
|
default:
|
|
6203
6918
|
{
|
|
6204
6919
|
GGML_ABORT("fatal error");
|
|
@@ -6232,6 +6947,30 @@ static void ggml_compute_forward_sgn_f32(
|
|
|
6232
6947
|
}
|
|
6233
6948
|
}
|
|
6234
6949
|
|
|
6950
|
+
static void ggml_compute_forward_sgn_f16(
|
|
6951
|
+
const struct ggml_compute_params * params,
|
|
6952
|
+
struct ggml_tensor * dst) {
|
|
6953
|
+
|
|
6954
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
6955
|
+
|
|
6956
|
+
if (params->ith != 0) {
|
|
6957
|
+
return;
|
|
6958
|
+
}
|
|
6959
|
+
|
|
6960
|
+
assert(ggml_is_contiguous_1(src0));
|
|
6961
|
+
assert(ggml_is_contiguous_1(dst));
|
|
6962
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
6963
|
+
|
|
6964
|
+
const int n = ggml_nrows(src0);
|
|
6965
|
+
const int nc = src0->ne[0];
|
|
6966
|
+
|
|
6967
|
+
for (int i = 0; i < n; i++) {
|
|
6968
|
+
ggml_vec_sgn_f16(nc,
|
|
6969
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
6970
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
6971
|
+
}
|
|
6972
|
+
}
|
|
6973
|
+
|
|
6235
6974
|
static void ggml_compute_forward_sgn(
|
|
6236
6975
|
const struct ggml_compute_params * params,
|
|
6237
6976
|
struct ggml_tensor * dst) {
|
|
@@ -6243,6 +6982,10 @@ static void ggml_compute_forward_sgn(
|
|
|
6243
6982
|
{
|
|
6244
6983
|
ggml_compute_forward_sgn_f32(params, dst);
|
|
6245
6984
|
} break;
|
|
6985
|
+
case GGML_TYPE_F16:
|
|
6986
|
+
{
|
|
6987
|
+
ggml_compute_forward_sgn_f16(params, dst);
|
|
6988
|
+
} break;
|
|
6246
6989
|
default:
|
|
6247
6990
|
{
|
|
6248
6991
|
GGML_ABORT("fatal error");
|
|
@@ -6276,6 +7019,30 @@ static void ggml_compute_forward_neg_f32(
|
|
|
6276
7019
|
}
|
|
6277
7020
|
}
|
|
6278
7021
|
|
|
7022
|
+
static void ggml_compute_forward_neg_f16(
|
|
7023
|
+
const struct ggml_compute_params * params,
|
|
7024
|
+
struct ggml_tensor * dst) {
|
|
7025
|
+
|
|
7026
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7027
|
+
|
|
7028
|
+
if (params->ith != 0) {
|
|
7029
|
+
return;
|
|
7030
|
+
}
|
|
7031
|
+
|
|
7032
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7033
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7034
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7035
|
+
|
|
7036
|
+
const int n = ggml_nrows(src0);
|
|
7037
|
+
const int nc = src0->ne[0];
|
|
7038
|
+
|
|
7039
|
+
for (int i = 0; i < n; i++) {
|
|
7040
|
+
ggml_vec_neg_f16(nc,
|
|
7041
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7042
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7043
|
+
}
|
|
7044
|
+
}
|
|
7045
|
+
|
|
6279
7046
|
static void ggml_compute_forward_neg(
|
|
6280
7047
|
const struct ggml_compute_params * params,
|
|
6281
7048
|
struct ggml_tensor * dst) {
|
|
@@ -6287,6 +7054,10 @@ static void ggml_compute_forward_neg(
|
|
|
6287
7054
|
{
|
|
6288
7055
|
ggml_compute_forward_neg_f32(params, dst);
|
|
6289
7056
|
} break;
|
|
7057
|
+
case GGML_TYPE_F16:
|
|
7058
|
+
{
|
|
7059
|
+
ggml_compute_forward_neg_f16(params, dst);
|
|
7060
|
+
} break;
|
|
6290
7061
|
default:
|
|
6291
7062
|
{
|
|
6292
7063
|
GGML_ABORT("fatal error");
|
|
@@ -6320,6 +7091,30 @@ static void ggml_compute_forward_step_f32(
|
|
|
6320
7091
|
}
|
|
6321
7092
|
}
|
|
6322
7093
|
|
|
7094
|
+
static void ggml_compute_forward_step_f16(
|
|
7095
|
+
const struct ggml_compute_params * params,
|
|
7096
|
+
struct ggml_tensor * dst) {
|
|
7097
|
+
|
|
7098
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7099
|
+
|
|
7100
|
+
if (params->ith != 0) {
|
|
7101
|
+
return;
|
|
7102
|
+
}
|
|
7103
|
+
|
|
7104
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7105
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7106
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7107
|
+
|
|
7108
|
+
const int n = ggml_nrows(src0);
|
|
7109
|
+
const int nc = src0->ne[0];
|
|
7110
|
+
|
|
7111
|
+
for (int i = 0; i < n; i++) {
|
|
7112
|
+
ggml_vec_step_f16(nc,
|
|
7113
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7114
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7115
|
+
}
|
|
7116
|
+
}
|
|
7117
|
+
|
|
6323
7118
|
static void ggml_compute_forward_step(
|
|
6324
7119
|
const struct ggml_compute_params * params,
|
|
6325
7120
|
struct ggml_tensor * dst) {
|
|
@@ -6331,6 +7126,10 @@ static void ggml_compute_forward_step(
|
|
|
6331
7126
|
{
|
|
6332
7127
|
ggml_compute_forward_step_f32(params, dst);
|
|
6333
7128
|
} break;
|
|
7129
|
+
case GGML_TYPE_F16:
|
|
7130
|
+
{
|
|
7131
|
+
ggml_compute_forward_step_f16(params, dst);
|
|
7132
|
+
} break;
|
|
6334
7133
|
default:
|
|
6335
7134
|
{
|
|
6336
7135
|
GGML_ABORT("fatal error");
|
|
@@ -6364,6 +7163,30 @@ static void ggml_compute_forward_tanh_f32(
|
|
|
6364
7163
|
}
|
|
6365
7164
|
}
|
|
6366
7165
|
|
|
7166
|
+
static void ggml_compute_forward_tanh_f16(
|
|
7167
|
+
const struct ggml_compute_params * params,
|
|
7168
|
+
struct ggml_tensor * dst) {
|
|
7169
|
+
|
|
7170
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7171
|
+
|
|
7172
|
+
if (params->ith != 0) {
|
|
7173
|
+
return;
|
|
7174
|
+
}
|
|
7175
|
+
|
|
7176
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7177
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7178
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7179
|
+
|
|
7180
|
+
const int n = ggml_nrows(src0);
|
|
7181
|
+
const int nc = src0->ne[0];
|
|
7182
|
+
|
|
7183
|
+
for (int i = 0; i < n; i++) {
|
|
7184
|
+
ggml_vec_tanh_f16(nc,
|
|
7185
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7186
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7187
|
+
}
|
|
7188
|
+
}
|
|
7189
|
+
|
|
6367
7190
|
static void ggml_compute_forward_tanh(
|
|
6368
7191
|
const struct ggml_compute_params * params,
|
|
6369
7192
|
struct ggml_tensor * dst) {
|
|
@@ -6375,6 +7198,10 @@ static void ggml_compute_forward_tanh(
|
|
|
6375
7198
|
{
|
|
6376
7199
|
ggml_compute_forward_tanh_f32(params, dst);
|
|
6377
7200
|
} break;
|
|
7201
|
+
case GGML_TYPE_F16:
|
|
7202
|
+
{
|
|
7203
|
+
ggml_compute_forward_tanh_f16(params, dst);
|
|
7204
|
+
} break;
|
|
6378
7205
|
default:
|
|
6379
7206
|
{
|
|
6380
7207
|
GGML_ABORT("fatal error");
|
|
@@ -6408,6 +7235,30 @@ static void ggml_compute_forward_elu_f32(
|
|
|
6408
7235
|
}
|
|
6409
7236
|
}
|
|
6410
7237
|
|
|
7238
|
+
static void ggml_compute_forward_elu_f16(
|
|
7239
|
+
const struct ggml_compute_params * params,
|
|
7240
|
+
struct ggml_tensor * dst) {
|
|
7241
|
+
|
|
7242
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7243
|
+
|
|
7244
|
+
if (params->ith != 0) {
|
|
7245
|
+
return;
|
|
7246
|
+
}
|
|
7247
|
+
|
|
7248
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7249
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7250
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7251
|
+
|
|
7252
|
+
const int n = ggml_nrows(src0);
|
|
7253
|
+
const int nc = src0->ne[0];
|
|
7254
|
+
|
|
7255
|
+
for (int i = 0; i < n; i++) {
|
|
7256
|
+
ggml_vec_elu_f16(nc,
|
|
7257
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7258
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7259
|
+
}
|
|
7260
|
+
}
|
|
7261
|
+
|
|
6411
7262
|
static void ggml_compute_forward_elu(
|
|
6412
7263
|
const struct ggml_compute_params * params,
|
|
6413
7264
|
struct ggml_tensor * dst) {
|
|
@@ -6419,6 +7270,10 @@ static void ggml_compute_forward_elu(
|
|
|
6419
7270
|
{
|
|
6420
7271
|
ggml_compute_forward_elu_f32(params, dst);
|
|
6421
7272
|
} break;
|
|
7273
|
+
case GGML_TYPE_F16:
|
|
7274
|
+
{
|
|
7275
|
+
ggml_compute_forward_elu_f16(params, dst);
|
|
7276
|
+
} break;
|
|
6422
7277
|
default:
|
|
6423
7278
|
{
|
|
6424
7279
|
GGML_ABORT("fatal error");
|
|
@@ -6452,6 +7307,30 @@ static void ggml_compute_forward_relu_f32(
|
|
|
6452
7307
|
}
|
|
6453
7308
|
}
|
|
6454
7309
|
|
|
7310
|
+
static void ggml_compute_forward_relu_f16(
|
|
7311
|
+
const struct ggml_compute_params * params,
|
|
7312
|
+
struct ggml_tensor * dst) {
|
|
7313
|
+
|
|
7314
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7315
|
+
|
|
7316
|
+
if (params->ith != 0) {
|
|
7317
|
+
return;
|
|
7318
|
+
}
|
|
7319
|
+
|
|
7320
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7321
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7322
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7323
|
+
|
|
7324
|
+
const int n = ggml_nrows(src0);
|
|
7325
|
+
const int nc = src0->ne[0];
|
|
7326
|
+
|
|
7327
|
+
for (int i = 0; i < n; i++) {
|
|
7328
|
+
ggml_vec_relu_f16(nc,
|
|
7329
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7330
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7331
|
+
}
|
|
7332
|
+
}
|
|
7333
|
+
|
|
6455
7334
|
static void ggml_compute_forward_relu(
|
|
6456
7335
|
const struct ggml_compute_params * params,
|
|
6457
7336
|
struct ggml_tensor * dst) {
|
|
@@ -6463,6 +7342,10 @@ static void ggml_compute_forward_relu(
|
|
|
6463
7342
|
{
|
|
6464
7343
|
ggml_compute_forward_relu_f32(params, dst);
|
|
6465
7344
|
} break;
|
|
7345
|
+
case GGML_TYPE_F16:
|
|
7346
|
+
{
|
|
7347
|
+
ggml_compute_forward_relu_f16(params, dst);
|
|
7348
|
+
} break;
|
|
6466
7349
|
default:
|
|
6467
7350
|
{
|
|
6468
7351
|
GGML_ABORT("fatal error");
|
|
@@ -6496,6 +7379,30 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
|
6496
7379
|
}
|
|
6497
7380
|
}
|
|
6498
7381
|
|
|
7382
|
+
static void ggml_compute_forward_sigmoid_f16(
|
|
7383
|
+
const struct ggml_compute_params * params,
|
|
7384
|
+
struct ggml_tensor * dst) {
|
|
7385
|
+
|
|
7386
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7387
|
+
|
|
7388
|
+
if (params->ith != 0) {
|
|
7389
|
+
return;
|
|
7390
|
+
}
|
|
7391
|
+
|
|
7392
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7393
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7394
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7395
|
+
|
|
7396
|
+
const int n = ggml_nrows(src0);
|
|
7397
|
+
const int nc = src0->ne[0];
|
|
7398
|
+
|
|
7399
|
+
for (int i = 0; i < n; i++) {
|
|
7400
|
+
ggml_vec_sigmoid_f16(nc,
|
|
7401
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7402
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7403
|
+
}
|
|
7404
|
+
}
|
|
7405
|
+
|
|
6499
7406
|
static void ggml_compute_forward_sigmoid(
|
|
6500
7407
|
const struct ggml_compute_params * params,
|
|
6501
7408
|
struct ggml_tensor * dst) {
|
|
@@ -6507,6 +7414,10 @@ static void ggml_compute_forward_sigmoid(
|
|
|
6507
7414
|
{
|
|
6508
7415
|
ggml_compute_forward_sigmoid_f32(params, dst);
|
|
6509
7416
|
} break;
|
|
7417
|
+
case GGML_TYPE_F16:
|
|
7418
|
+
{
|
|
7419
|
+
ggml_compute_forward_sigmoid_f16(params, dst);
|
|
7420
|
+
} break;
|
|
6510
7421
|
default:
|
|
6511
7422
|
{
|
|
6512
7423
|
GGML_ABORT("fatal error");
|
|
@@ -6555,6 +7466,46 @@ static void ggml_compute_forward_gelu_f32(
|
|
|
6555
7466
|
}
|
|
6556
7467
|
}
|
|
6557
7468
|
|
|
7469
|
+
static void ggml_compute_forward_gelu_f16(
|
|
7470
|
+
const struct ggml_compute_params * params,
|
|
7471
|
+
struct ggml_tensor * dst) {
|
|
7472
|
+
|
|
7473
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7474
|
+
|
|
7475
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7476
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7477
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7478
|
+
|
|
7479
|
+
const int ith = params->ith;
|
|
7480
|
+
const int nth = params->nth;
|
|
7481
|
+
|
|
7482
|
+
const int nc = src0->ne[0];
|
|
7483
|
+
const int nr = ggml_nrows(src0);
|
|
7484
|
+
|
|
7485
|
+
// rows per thread
|
|
7486
|
+
const int dr = (nr + nth - 1)/nth;
|
|
7487
|
+
|
|
7488
|
+
// row range for this thread
|
|
7489
|
+
const int ir0 = dr*ith;
|
|
7490
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
|
7491
|
+
|
|
7492
|
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
|
7493
|
+
ggml_vec_gelu_f16(nc,
|
|
7494
|
+
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
|
|
7495
|
+
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
|
|
7496
|
+
|
|
7497
|
+
#ifndef NDEBUG
|
|
7498
|
+
for (int k = 0; k < nc; k++) {
|
|
7499
|
+
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
|
7500
|
+
const float v = GGML_FP16_TO_FP32(x);
|
|
7501
|
+
UNUSED(v);
|
|
7502
|
+
assert(!isnan(v));
|
|
7503
|
+
assert(!isinf(v));
|
|
7504
|
+
}
|
|
7505
|
+
#endif
|
|
7506
|
+
}
|
|
7507
|
+
}
|
|
7508
|
+
|
|
6558
7509
|
static void ggml_compute_forward_gelu(
|
|
6559
7510
|
const struct ggml_compute_params * params,
|
|
6560
7511
|
struct ggml_tensor * dst) {
|
|
@@ -6566,6 +7517,10 @@ static void ggml_compute_forward_gelu(
|
|
|
6566
7517
|
{
|
|
6567
7518
|
ggml_compute_forward_gelu_f32(params, dst);
|
|
6568
7519
|
} break;
|
|
7520
|
+
case GGML_TYPE_F16:
|
|
7521
|
+
{
|
|
7522
|
+
ggml_compute_forward_gelu_f16(params, dst);
|
|
7523
|
+
} break;
|
|
6569
7524
|
default:
|
|
6570
7525
|
{
|
|
6571
7526
|
GGML_ABORT("fatal error");
|
|
@@ -6614,6 +7569,46 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
|
6614
7569
|
}
|
|
6615
7570
|
}
|
|
6616
7571
|
|
|
7572
|
+
static void ggml_compute_forward_gelu_quick_f16(
|
|
7573
|
+
const struct ggml_compute_params * params,
|
|
7574
|
+
struct ggml_tensor * dst) {
|
|
7575
|
+
|
|
7576
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7577
|
+
|
|
7578
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7579
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7580
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7581
|
+
|
|
7582
|
+
const int ith = params->ith;
|
|
7583
|
+
const int nth = params->nth;
|
|
7584
|
+
|
|
7585
|
+
const int nc = src0->ne[0];
|
|
7586
|
+
const int nr = ggml_nrows(src0);
|
|
7587
|
+
|
|
7588
|
+
// rows per thread
|
|
7589
|
+
const int dr = (nr + nth - 1)/nth;
|
|
7590
|
+
|
|
7591
|
+
// row range for this thread
|
|
7592
|
+
const int ir0 = dr*ith;
|
|
7593
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
|
7594
|
+
|
|
7595
|
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
|
7596
|
+
ggml_vec_gelu_quick_f16(nc,
|
|
7597
|
+
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
|
|
7598
|
+
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
|
|
7599
|
+
|
|
7600
|
+
#ifndef NDEBUG
|
|
7601
|
+
for (int k = 0; k < nc; k++) {
|
|
7602
|
+
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
|
7603
|
+
const float v = GGML_FP16_TO_FP32(x);
|
|
7604
|
+
UNUSED(v);
|
|
7605
|
+
assert(!isnan(v));
|
|
7606
|
+
assert(!isinf(v));
|
|
7607
|
+
}
|
|
7608
|
+
#endif
|
|
7609
|
+
}
|
|
7610
|
+
}
|
|
7611
|
+
|
|
6617
7612
|
static void ggml_compute_forward_gelu_quick(
|
|
6618
7613
|
const struct ggml_compute_params * params,
|
|
6619
7614
|
struct ggml_tensor * dst) {
|
|
@@ -6625,6 +7620,10 @@ static void ggml_compute_forward_gelu_quick(
|
|
|
6625
7620
|
{
|
|
6626
7621
|
ggml_compute_forward_gelu_quick_f32(params, dst);
|
|
6627
7622
|
} break;
|
|
7623
|
+
case GGML_TYPE_F16:
|
|
7624
|
+
{
|
|
7625
|
+
ggml_compute_forward_gelu_quick_f16(params, dst);
|
|
7626
|
+
} break;
|
|
6628
7627
|
default:
|
|
6629
7628
|
{
|
|
6630
7629
|
GGML_ABORT("fatal error");
|
|
@@ -6673,6 +7672,46 @@ static void ggml_compute_forward_silu_f32(
|
|
|
6673
7672
|
}
|
|
6674
7673
|
}
|
|
6675
7674
|
|
|
7675
|
+
static void ggml_compute_forward_silu_f16(
|
|
7676
|
+
const struct ggml_compute_params * params,
|
|
7677
|
+
struct ggml_tensor * dst) {
|
|
7678
|
+
|
|
7679
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7680
|
+
|
|
7681
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7682
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7683
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7684
|
+
|
|
7685
|
+
const int ith = params->ith;
|
|
7686
|
+
const int nth = params->nth;
|
|
7687
|
+
|
|
7688
|
+
const int nc = src0->ne[0];
|
|
7689
|
+
const int nr = ggml_nrows(src0);
|
|
7690
|
+
|
|
7691
|
+
// rows per thread
|
|
7692
|
+
const int dr = (nr + nth - 1)/nth;
|
|
7693
|
+
|
|
7694
|
+
// row range for this thread
|
|
7695
|
+
const int ir0 = dr*ith;
|
|
7696
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
|
7697
|
+
|
|
7698
|
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
|
7699
|
+
ggml_vec_silu_f16(nc,
|
|
7700
|
+
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
|
|
7701
|
+
(ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1])));
|
|
7702
|
+
|
|
7703
|
+
#ifndef NDEBUG
|
|
7704
|
+
for (int k = 0; k < nc; k++) {
|
|
7705
|
+
const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*(dst->nb[1])))[k];
|
|
7706
|
+
const float v = GGML_FP16_TO_FP32(x);
|
|
7707
|
+
UNUSED(v);
|
|
7708
|
+
assert(!isnan(v));
|
|
7709
|
+
assert(!isinf(v));
|
|
7710
|
+
}
|
|
7711
|
+
#endif
|
|
7712
|
+
}
|
|
7713
|
+
}
|
|
7714
|
+
|
|
6676
7715
|
static void ggml_compute_forward_silu(
|
|
6677
7716
|
const struct ggml_compute_params * params,
|
|
6678
7717
|
struct ggml_tensor * dst) {
|
|
@@ -6684,6 +7723,10 @@ static void ggml_compute_forward_silu(
|
|
|
6684
7723
|
{
|
|
6685
7724
|
ggml_compute_forward_silu_f32(params, dst);
|
|
6686
7725
|
} break;
|
|
7726
|
+
case GGML_TYPE_F16:
|
|
7727
|
+
{
|
|
7728
|
+
ggml_compute_forward_silu_f16(params, dst);
|
|
7729
|
+
} break;
|
|
6687
7730
|
default:
|
|
6688
7731
|
{
|
|
6689
7732
|
GGML_ABORT("fatal error");
|
|
@@ -6712,13 +7755,43 @@ static void ggml_compute_forward_leaky_relu_f32(
|
|
|
6712
7755
|
float negative_slope;
|
|
6713
7756
|
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
|
6714
7757
|
|
|
6715
|
-
assert(dst->nb[0] == sizeof(float));
|
|
6716
|
-
assert(src0->nb[0] == sizeof(float));
|
|
7758
|
+
assert(dst->nb[0] == sizeof(float));
|
|
7759
|
+
assert(src0->nb[0] == sizeof(float));
|
|
7760
|
+
|
|
7761
|
+
for (int i = 0; i < n; i++) {
|
|
7762
|
+
ggml_vec_leaky_relu_f32(nc,
|
|
7763
|
+
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7764
|
+
(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
|
|
7765
|
+
}
|
|
7766
|
+
}
|
|
7767
|
+
|
|
7768
|
+
static void ggml_compute_forward_leaky_relu_f16(
|
|
7769
|
+
const struct ggml_compute_params * params,
|
|
7770
|
+
struct ggml_tensor * dst) {
|
|
7771
|
+
|
|
7772
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7773
|
+
|
|
7774
|
+
if (params->ith != 0) {
|
|
7775
|
+
return;
|
|
7776
|
+
}
|
|
7777
|
+
|
|
7778
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7779
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7780
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7781
|
+
|
|
7782
|
+
const int n = ggml_nrows(src0);
|
|
7783
|
+
const int nc = src0->ne[0];
|
|
7784
|
+
|
|
7785
|
+
float negative_slope;
|
|
7786
|
+
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
|
7787
|
+
|
|
7788
|
+
assert(dst->nb[0] == sizeof(ggml_fp16_t));
|
|
7789
|
+
assert(src0->nb[0] == sizeof(ggml_fp16_t));
|
|
6717
7790
|
|
|
6718
7791
|
for (int i = 0; i < n; i++) {
|
|
6719
|
-
|
|
6720
|
-
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
6721
|
-
(float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
|
|
7792
|
+
ggml_vec_leaky_relu_f16(nc,
|
|
7793
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7794
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
|
|
6722
7795
|
}
|
|
6723
7796
|
}
|
|
6724
7797
|
|
|
@@ -6733,6 +7806,10 @@ static void ggml_compute_forward_leaky_relu(
|
|
|
6733
7806
|
{
|
|
6734
7807
|
ggml_compute_forward_leaky_relu_f32(params, dst);
|
|
6735
7808
|
} break;
|
|
7809
|
+
case GGML_TYPE_F16:
|
|
7810
|
+
{
|
|
7811
|
+
ggml_compute_forward_leaky_relu_f16(params, dst);
|
|
7812
|
+
} break;
|
|
6736
7813
|
default:
|
|
6737
7814
|
{
|
|
6738
7815
|
GGML_ABORT("fatal error");
|
|
@@ -6785,6 +7862,50 @@ static void ggml_compute_forward_silu_back_f32(
|
|
|
6785
7862
|
}
|
|
6786
7863
|
}
|
|
6787
7864
|
|
|
7865
|
+
static void ggml_compute_forward_silu_back_f16(
|
|
7866
|
+
const struct ggml_compute_params * params,
|
|
7867
|
+
struct ggml_tensor * dst) {
|
|
7868
|
+
|
|
7869
|
+
const struct ggml_tensor * grad = dst->src[0];
|
|
7870
|
+
const struct ggml_tensor * src1 = dst->src[1];
|
|
7871
|
+
|
|
7872
|
+
assert(ggml_is_contiguous_1(grad));
|
|
7873
|
+
assert(ggml_is_contiguous_1(src1));
|
|
7874
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7875
|
+
assert(ggml_are_same_shape(src1, dst));
|
|
7876
|
+
assert(ggml_are_same_shape(src1, grad));
|
|
7877
|
+
|
|
7878
|
+
const int ith = params->ith;
|
|
7879
|
+
const int nth = params->nth;
|
|
7880
|
+
|
|
7881
|
+
const int nc = src1->ne[0];
|
|
7882
|
+
const int nr = ggml_nrows(src1);
|
|
7883
|
+
|
|
7884
|
+
// rows per thread
|
|
7885
|
+
const int dr = (nr + nth - 1)/nth;
|
|
7886
|
+
|
|
7887
|
+
// row range for this thread
|
|
7888
|
+
const int ir0 = dr*ith;
|
|
7889
|
+
const int ir1 = MIN(ir0 + dr, nr);
|
|
7890
|
+
|
|
7891
|
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
|
7892
|
+
ggml_vec_silu_backward_f16(nc,
|
|
7893
|
+
(ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])),
|
|
7894
|
+
(ggml_fp16_t *) ((char *) src1->data + i1*(src1->nb[1])),
|
|
7895
|
+
(ggml_fp16_t *) ((char *) grad->data + i1*(grad->nb[1])));
|
|
7896
|
+
|
|
7897
|
+
#ifndef NDEBUG
|
|
7898
|
+
for (int k = 0; k < nc; k++) {
|
|
7899
|
+
const float x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
|
7900
|
+
const float v = GGML_FP16_TO_FP32(x);
|
|
7901
|
+
UNUSED(v);
|
|
7902
|
+
assert(!isnan(v));
|
|
7903
|
+
assert(!isinf(v));
|
|
7904
|
+
}
|
|
7905
|
+
#endif
|
|
7906
|
+
}
|
|
7907
|
+
}
|
|
7908
|
+
|
|
6788
7909
|
static void ggml_compute_forward_silu_back(
|
|
6789
7910
|
const struct ggml_compute_params * params,
|
|
6790
7911
|
struct ggml_tensor * dst) {
|
|
@@ -6796,6 +7917,10 @@ static void ggml_compute_forward_silu_back(
|
|
|
6796
7917
|
{
|
|
6797
7918
|
ggml_compute_forward_silu_back_f32(params, dst);
|
|
6798
7919
|
} break;
|
|
7920
|
+
case GGML_TYPE_F16:
|
|
7921
|
+
{
|
|
7922
|
+
ggml_compute_forward_silu_back_f16(params, dst);
|
|
7923
|
+
} break;
|
|
6799
7924
|
default:
|
|
6800
7925
|
{
|
|
6801
7926
|
GGML_ABORT("fatal error");
|
|
@@ -6803,7 +7928,6 @@ static void ggml_compute_forward_silu_back(
|
|
|
6803
7928
|
}
|
|
6804
7929
|
}
|
|
6805
7930
|
|
|
6806
|
-
|
|
6807
7931
|
static void ggml_compute_forward_hardswish_f32(
|
|
6808
7932
|
const struct ggml_compute_params * params,
|
|
6809
7933
|
struct ggml_tensor * dst) {
|
|
@@ -6827,6 +7951,31 @@ static void ggml_compute_forward_hardswish_f32(
|
|
|
6827
7951
|
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
|
6828
7952
|
}
|
|
6829
7953
|
}
|
|
7954
|
+
|
|
7955
|
+
static void ggml_compute_forward_hardswish_f16(
|
|
7956
|
+
const struct ggml_compute_params * params,
|
|
7957
|
+
struct ggml_tensor * dst) {
|
|
7958
|
+
|
|
7959
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
7960
|
+
|
|
7961
|
+
if (params->ith != 0) {
|
|
7962
|
+
return;
|
|
7963
|
+
}
|
|
7964
|
+
|
|
7965
|
+
assert(ggml_is_contiguous_1(src0));
|
|
7966
|
+
assert(ggml_is_contiguous_1(dst));
|
|
7967
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
7968
|
+
|
|
7969
|
+
const int n = ggml_nrows(src0);
|
|
7970
|
+
const int nc = src0->ne[0];
|
|
7971
|
+
|
|
7972
|
+
for (int i = 0; i < n; i++) {
|
|
7973
|
+
ggml_vec_hardswish_f16(nc,
|
|
7974
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
7975
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
7976
|
+
}
|
|
7977
|
+
}
|
|
7978
|
+
|
|
6830
7979
|
static void ggml_compute_forward_hardswish(
|
|
6831
7980
|
const struct ggml_compute_params * params,
|
|
6832
7981
|
struct ggml_tensor * dst) {
|
|
@@ -6838,6 +7987,10 @@ static void ggml_compute_forward_hardswish(
|
|
|
6838
7987
|
{
|
|
6839
7988
|
ggml_compute_forward_hardswish_f32(params, dst);
|
|
6840
7989
|
} break;
|
|
7990
|
+
case GGML_TYPE_F16:
|
|
7991
|
+
{
|
|
7992
|
+
ggml_compute_forward_hardswish_f16(params, dst);
|
|
7993
|
+
} break;
|
|
6841
7994
|
default:
|
|
6842
7995
|
{
|
|
6843
7996
|
GGML_ABORT("fatal error");
|
|
@@ -6869,6 +8022,30 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
|
6869
8022
|
}
|
|
6870
8023
|
}
|
|
6871
8024
|
|
|
8025
|
+
static void ggml_compute_forward_hardsigmoid_f16(
|
|
8026
|
+
const struct ggml_compute_params * params,
|
|
8027
|
+
struct ggml_tensor * dst) {
|
|
8028
|
+
|
|
8029
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
8030
|
+
|
|
8031
|
+
if (params->ith != 0) {
|
|
8032
|
+
return;
|
|
8033
|
+
}
|
|
8034
|
+
|
|
8035
|
+
assert(ggml_is_contiguous_1(src0));
|
|
8036
|
+
assert(ggml_is_contiguous_1(dst));
|
|
8037
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
8038
|
+
|
|
8039
|
+
const int n = ggml_nrows(src0);
|
|
8040
|
+
const int nc = src0->ne[0];
|
|
8041
|
+
|
|
8042
|
+
for (int i = 0; i < n; i++) {
|
|
8043
|
+
ggml_vec_hardsigmoid_f16(nc,
|
|
8044
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
8045
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
8046
|
+
}
|
|
8047
|
+
}
|
|
8048
|
+
|
|
6872
8049
|
static void ggml_compute_forward_hardsigmoid(
|
|
6873
8050
|
const struct ggml_compute_params * params,
|
|
6874
8051
|
struct ggml_tensor * dst) {
|
|
@@ -6880,6 +8057,10 @@ static void ggml_compute_forward_hardsigmoid(
|
|
|
6880
8057
|
{
|
|
6881
8058
|
ggml_compute_forward_hardsigmoid_f32(params, dst);
|
|
6882
8059
|
} break;
|
|
8060
|
+
case GGML_TYPE_F16:
|
|
8061
|
+
{
|
|
8062
|
+
ggml_compute_forward_hardsigmoid_f16(params, dst);
|
|
8063
|
+
} break;
|
|
6883
8064
|
default:
|
|
6884
8065
|
{
|
|
6885
8066
|
GGML_ABORT("fatal error");
|
|
@@ -6911,6 +8092,30 @@ static void ggml_compute_forward_exp_f32(
|
|
|
6911
8092
|
}
|
|
6912
8093
|
}
|
|
6913
8094
|
|
|
8095
|
+
static void ggml_compute_forward_exp_f16(
|
|
8096
|
+
const struct ggml_compute_params * params,
|
|
8097
|
+
struct ggml_tensor * dst) {
|
|
8098
|
+
|
|
8099
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
8100
|
+
|
|
8101
|
+
if (params->ith != 0) {
|
|
8102
|
+
return;
|
|
8103
|
+
}
|
|
8104
|
+
|
|
8105
|
+
assert(ggml_is_contiguous_1(src0));
|
|
8106
|
+
assert(ggml_is_contiguous_1(dst));
|
|
8107
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
8108
|
+
|
|
8109
|
+
const int n = ggml_nrows(src0);
|
|
8110
|
+
const int nc = src0->ne[0];
|
|
8111
|
+
|
|
8112
|
+
for (int i = 0; i < n; i++) {
|
|
8113
|
+
ggml_vec_exp_f16(nc,
|
|
8114
|
+
(ggml_fp16_t *) ((char *) dst->data + i*( dst->nb[1])),
|
|
8115
|
+
(ggml_fp16_t *) ((char *) src0->data + i*(src0->nb[1])));
|
|
8116
|
+
}
|
|
8117
|
+
}
|
|
8118
|
+
|
|
6914
8119
|
static void ggml_compute_forward_exp(
|
|
6915
8120
|
const struct ggml_compute_params * params,
|
|
6916
8121
|
struct ggml_tensor * dst) {
|
|
@@ -6922,6 +8127,10 @@ static void ggml_compute_forward_exp(
|
|
|
6922
8127
|
{
|
|
6923
8128
|
ggml_compute_forward_exp_f32(params, dst);
|
|
6924
8129
|
} break;
|
|
8130
|
+
case GGML_TYPE_F16:
|
|
8131
|
+
{
|
|
8132
|
+
ggml_compute_forward_exp_f16(params, dst);
|
|
8133
|
+
} break;
|
|
6925
8134
|
default:
|
|
6926
8135
|
{
|
|
6927
8136
|
GGML_ABORT("fatal error");
|
|
@@ -7496,6 +8705,7 @@ UseGgmlGemm1:;
|
|
|
7496
8705
|
if (src1->type != vec_dot_type) {
|
|
7497
8706
|
char * wdata = params->wdata;
|
|
7498
8707
|
|
|
8708
|
+
const size_t nbw0 = ggml_type_size(vec_dot_type);
|
|
7499
8709
|
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
|
|
7500
8710
|
const size_t nbw2 = nbw1*ne11;
|
|
7501
8711
|
const size_t nbw3 = nbw2*ne12;
|
|
@@ -7503,6 +8713,7 @@ UseGgmlGemm1:;
|
|
|
7503
8713
|
assert(params->wsize >= ne13*nbw3);
|
|
7504
8714
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
7505
8715
|
|
|
8716
|
+
#if 0
|
|
7506
8717
|
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
|
7507
8718
|
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
7508
8719
|
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
|
@@ -7512,6 +8723,20 @@ UseGgmlGemm1:;
|
|
|
7512
8723
|
}
|
|
7513
8724
|
}
|
|
7514
8725
|
}
|
|
8726
|
+
#else
|
|
8727
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
|
8728
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
8729
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
8730
|
+
size_t bs = ggml_blck_size(vec_dot_type);
|
|
8731
|
+
int64_t ne10_block_start = (ith * ne10/bs) / nth;
|
|
8732
|
+
int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
|
|
8733
|
+
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
|
|
8734
|
+
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
|
|
8735
|
+
(ne10_block_end - ne10_block_start) * bs);
|
|
8736
|
+
}
|
|
8737
|
+
}
|
|
8738
|
+
}
|
|
8739
|
+
#endif
|
|
7515
8740
|
}
|
|
7516
8741
|
|
|
7517
8742
|
if (ith == 0) {
|
|
@@ -7566,7 +8791,7 @@ UseGgmlGemm2:;
|
|
|
7566
8791
|
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
|
7567
8792
|
|
|
7568
8793
|
// If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
|
|
7569
|
-
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/
|
|
8794
|
+
// Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
|
|
7570
8795
|
// In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
|
|
7571
8796
|
if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
|
|
7572
8797
|
// distribute the thread work across the inner or outer loop based on which one is larger
|
|
@@ -7599,7 +8824,6 @@ UseGgmlGemm2:;
|
|
|
7599
8824
|
if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
|
|
7600
8825
|
num_rows_per_vec_dot = 1;
|
|
7601
8826
|
}
|
|
7602
|
-
|
|
7603
8827
|
ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
|
7604
8828
|
|
|
7605
8829
|
if (nth >= nchunk0 * nchunk1) {
|
|
@@ -7612,6 +8836,84 @@ UseGgmlGemm2:;
|
|
|
7612
8836
|
|
|
7613
8837
|
// ggml_compute_forward_mul_mat_id
|
|
7614
8838
|
|
|
8839
|
+
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
|
|
8840
|
+
|
|
8841
|
+
struct mmid_row_mapping {
|
|
8842
|
+
int32_t i1;
|
|
8843
|
+
int32_t i2;
|
|
8844
|
+
};
|
|
8845
|
+
|
|
8846
|
+
static void ggml_compute_forward_mul_mat_id_one_chunk(
|
|
8847
|
+
struct ggml_tensor * dst,
|
|
8848
|
+
const struct ggml_tensor * src0,
|
|
8849
|
+
const struct ggml_tensor * src1,
|
|
8850
|
+
const struct ggml_tensor * ids,
|
|
8851
|
+
const int64_t cur_a,
|
|
8852
|
+
const int64_t ir0_start,
|
|
8853
|
+
const int64_t ir0_end,
|
|
8854
|
+
const int64_t ir1_start,
|
|
8855
|
+
const int64_t ir1_end,
|
|
8856
|
+
const char * src0_cur,
|
|
8857
|
+
const struct mmid_row_mapping * matrix_rows,
|
|
8858
|
+
const size_t row_size,
|
|
8859
|
+
const bool src1_cont,
|
|
8860
|
+
const void * wdata) {
|
|
8861
|
+
|
|
8862
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
8863
|
+
|
|
8864
|
+
const enum ggml_type type = src0->type;
|
|
8865
|
+
|
|
8866
|
+
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
|
8867
|
+
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
|
8868
|
+
|
|
8869
|
+
const int64_t blck_0 = 16;
|
|
8870
|
+
const int64_t blck_1 = 16;
|
|
8871
|
+
|
|
8872
|
+
float tmp[16];
|
|
8873
|
+
|
|
8874
|
+
for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
|
|
8875
|
+
for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
|
|
8876
|
+
for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
|
|
8877
|
+
const int64_t _i12 = ir1; // logical row index for this expert
|
|
8878
|
+
|
|
8879
|
+
struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
|
|
8880
|
+
const int id = row_mapping.i1; // selected expert index
|
|
8881
|
+
|
|
8882
|
+
const int64_t i11 = id % ne11;
|
|
8883
|
+
const int64_t i12 = row_mapping.i2; // row index in src1
|
|
8884
|
+
|
|
8885
|
+
const int64_t i1 = id; // selected expert index
|
|
8886
|
+
const int64_t i2 = i12; // row
|
|
8887
|
+
|
|
8888
|
+
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
|
8889
|
+
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
|
8890
|
+
// the original src1 data pointer, so we should index using the indices directly
|
|
8891
|
+
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
|
8892
|
+
const char * src1_col = (const char *) wdata +
|
|
8893
|
+
(src1_cont || src1->type != vec_dot_type
|
|
8894
|
+
? (i11 + i12*ne11)*row_size
|
|
8895
|
+
: (i11*nb11 + i12*nb12));
|
|
8896
|
+
|
|
8897
|
+
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
|
|
8898
|
+
|
|
8899
|
+
for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
|
|
8900
|
+
vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
|
|
8901
|
+
}
|
|
8902
|
+
|
|
8903
|
+
memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
|
|
8904
|
+
}
|
|
8905
|
+
}
|
|
8906
|
+
}
|
|
8907
|
+
}
|
|
8908
|
+
|
|
8909
|
+
static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
|
|
8910
|
+
|
|
8911
|
+
void * ptr = *p;
|
|
8912
|
+
ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
|
|
8913
|
+
*p = (void *) ((char *) ptr + size);
|
|
8914
|
+
return ptr;
|
|
8915
|
+
}
|
|
8916
|
+
|
|
7615
8917
|
static void ggml_compute_forward_mul_mat_id(
|
|
7616
8918
|
const struct ggml_compute_params * params,
|
|
7617
8919
|
struct ggml_tensor * dst) {
|
|
@@ -7629,7 +8931,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
7629
8931
|
|
|
7630
8932
|
const bool src1_cont = ggml_is_contiguous(src1);
|
|
7631
8933
|
|
|
7632
|
-
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
|
7633
8934
|
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
|
7634
8935
|
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
|
7635
8936
|
|
|
@@ -7647,21 +8948,27 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
7647
8948
|
const int n_ids = ids->ne[0]; // n_expert_used
|
|
7648
8949
|
const int n_as = ne02; // n_expert
|
|
7649
8950
|
|
|
7650
|
-
|
|
7651
|
-
(char *) params->wdata :
|
|
7652
|
-
(char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
|
|
8951
|
+
void * wdata_cur = params->wdata;
|
|
7653
8952
|
|
|
7654
|
-
|
|
7655
|
-
|
|
7656
|
-
|
|
7657
|
-
|
|
8953
|
+
if (src1->type != vec_dot_type) {
|
|
8954
|
+
incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
|
|
8955
|
+
}
|
|
8956
|
+
|
|
8957
|
+
int64_t * matrix_row_counts = // [n_as]
|
|
8958
|
+
incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
|
|
7658
8959
|
|
|
7659
|
-
|
|
7660
|
-
|
|
8960
|
+
struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
|
|
8961
|
+
incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
|
|
8962
|
+
|
|
8963
|
+
char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
|
|
8964
|
+
incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
|
|
8965
|
+
|
|
8966
|
+
GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
|
|
7661
8967
|
|
|
7662
8968
|
if (src1->type != vec_dot_type) {
|
|
7663
8969
|
char * wdata = params->wdata;
|
|
7664
8970
|
|
|
8971
|
+
const size_t nbw0 = ggml_type_size(vec_dot_type);
|
|
7665
8972
|
const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
|
|
7666
8973
|
const size_t nbw2 = nbw1*ne11;
|
|
7667
8974
|
const size_t nbw3 = nbw2*ne12;
|
|
@@ -7669,19 +8976,32 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
7669
8976
|
assert(params->wsize >= ne13*nbw3);
|
|
7670
8977
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
7671
8978
|
|
|
8979
|
+
#if 0
|
|
7672
8980
|
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
|
7673
|
-
for (int64_t i12 =
|
|
7674
|
-
for (int64_t i11 =
|
|
8981
|
+
for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
|
|
8982
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
7675
8983
|
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
|
7676
8984
|
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
|
7677
8985
|
ne10);
|
|
7678
8986
|
}
|
|
7679
8987
|
}
|
|
7680
8988
|
}
|
|
8989
|
+
#else
|
|
8990
|
+
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
|
8991
|
+
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
|
8992
|
+
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
|
8993
|
+
size_t bs = ggml_blck_size(vec_dot_type);
|
|
8994
|
+
int64_t ne10_block_start = (ith * ne10/bs) / nth;
|
|
8995
|
+
int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
|
|
8996
|
+
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
|
|
8997
|
+
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
|
|
8998
|
+
(ne10_block_end - ne10_block_start) * bs);
|
|
8999
|
+
}
|
|
9000
|
+
}
|
|
9001
|
+
}
|
|
9002
|
+
#endif
|
|
7681
9003
|
}
|
|
7682
9004
|
|
|
7683
|
-
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
|
|
7684
|
-
|
|
7685
9005
|
if (ith == 0) {
|
|
7686
9006
|
// initialize matrix_row_counts
|
|
7687
9007
|
memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
|
|
@@ -7699,9 +9019,14 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
7699
9019
|
}
|
|
7700
9020
|
}
|
|
7701
9021
|
|
|
9022
|
+
// reset current_chunk
|
|
9023
|
+
for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
|
|
9024
|
+
atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
|
|
9025
|
+
*current_chunk_ctr = nth;
|
|
9026
|
+
}
|
|
9027
|
+
|
|
7702
9028
|
ggml_barrier(params->threadpool);
|
|
7703
9029
|
|
|
7704
|
-
// compute each matrix multiplication in sequence
|
|
7705
9030
|
for (int cur_a = 0; cur_a < n_as; ++cur_a) {
|
|
7706
9031
|
const int64_t cne1 = matrix_row_counts[cur_a];
|
|
7707
9032
|
|
|
@@ -7709,84 +9034,64 @@ static void ggml_compute_forward_mul_mat_id(
|
|
|
7709
9034
|
continue;
|
|
7710
9035
|
}
|
|
7711
9036
|
|
|
7712
|
-
const char * src0_cur = (const char *) src0->data + cur_a*nb02;
|
|
7713
|
-
|
|
7714
|
-
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
|
9037
|
+
const char * src0_cur = (const char *) src0->data + cur_a * nb02;
|
|
9038
|
+
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
|
7715
9039
|
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
|
7716
9040
|
|
|
7717
|
-
const int64_t nr0 = ne01;
|
|
7718
|
-
const int64_t nr1 = cne1;
|
|
7719
|
-
|
|
7720
|
-
// distribute the thread work across the inner or outer loop based on which one is larger
|
|
7721
|
-
|
|
7722
|
-
const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
|
|
7723
|
-
const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
|
|
7724
|
-
|
|
7725
|
-
const int64_t ith0 = ith % nth0;
|
|
7726
|
-
const int64_t ith1 = ith / nth0;
|
|
7727
|
-
|
|
7728
|
-
const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
|
|
7729
|
-
const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
|
|
9041
|
+
const int64_t nr0 = ne01;
|
|
9042
|
+
const int64_t nr1 = cne1;
|
|
7730
9043
|
|
|
7731
|
-
|
|
7732
|
-
|
|
7733
|
-
|
|
7734
|
-
|
|
7735
|
-
const int64_t ir111 = MIN(ir110 + dr1, nr1);
|
|
7736
|
-
|
|
7737
|
-
// threads with no work simply yield (not sure if it helps)
|
|
7738
|
-
//if (ir010 >= ir011 || ir110 >= ir111) {
|
|
7739
|
-
// sched_yield();
|
|
7740
|
-
// continue;
|
|
7741
|
-
//}
|
|
9044
|
+
int chunk_size = 16;
|
|
9045
|
+
if (nr0 == 1 || nr1 == 1) {
|
|
9046
|
+
chunk_size = 64;
|
|
9047
|
+
}
|
|
7742
9048
|
|
|
7743
|
-
|
|
7744
|
-
|
|
7745
|
-
const
|
|
9049
|
+
#if defined(__aarch64__)
|
|
9050
|
+
// disable for ARM
|
|
9051
|
+
const bool disable_chunking = true;
|
|
9052
|
+
#else
|
|
9053
|
+
// disable for NUMA
|
|
9054
|
+
const bool disable_chunking = ggml_is_numa();
|
|
9055
|
+
#endif // defined(__aarch64__)
|
|
7746
9056
|
|
|
7747
|
-
|
|
7748
|
-
|
|
9057
|
+
int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
|
|
9058
|
+
int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
|
|
7749
9059
|
|
|
7750
|
-
|
|
7751
|
-
|
|
7752
|
-
|
|
7753
|
-
|
|
9060
|
+
if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
|
|
9061
|
+
nchunk0 = nr0 > nr1 ? nth : 1;
|
|
9062
|
+
nchunk1 = nr0 > nr1 ? 1 : nth;
|
|
9063
|
+
}
|
|
7754
9064
|
|
|
7755
|
-
|
|
7756
|
-
|
|
9065
|
+
const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
|
9066
|
+
const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
|
|
7757
9067
|
|
|
7758
|
-
|
|
7759
|
-
const int64_t i12 = row_mapping.i2; // row index in src1
|
|
9068
|
+
int current_chunk = ith;
|
|
7760
9069
|
|
|
7761
|
-
|
|
7762
|
-
const int64_t i2 = i12; // row
|
|
9070
|
+
atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
|
|
7763
9071
|
|
|
7764
|
-
|
|
7765
|
-
|
|
7766
|
-
|
|
7767
|
-
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
|
7768
|
-
const char * src1_col = (const char *) wdata +
|
|
7769
|
-
(src1_cont || src1->type != vec_dot_type
|
|
7770
|
-
? (i11 + i12*ne11)*row_size
|
|
7771
|
-
: (i11*nb11 + i12*nb12));
|
|
9072
|
+
while (current_chunk < nchunk0 * nchunk1) {
|
|
9073
|
+
const int64_t ith0 = current_chunk % nchunk0;
|
|
9074
|
+
const int64_t ith1 = current_chunk / nchunk0;
|
|
7772
9075
|
|
|
7773
|
-
|
|
9076
|
+
const int64_t ir0_start = dr0 * ith0;
|
|
9077
|
+
const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
|
|
7774
9078
|
|
|
7775
|
-
|
|
7776
|
-
|
|
7777
|
-
//}
|
|
9079
|
+
const int64_t ir1_start = dr1 * ith1;
|
|
9080
|
+
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
|
7778
9081
|
|
|
7779
|
-
|
|
7780
|
-
|
|
7781
|
-
|
|
9082
|
+
ggml_compute_forward_mul_mat_id_one_chunk(
|
|
9083
|
+
dst, src0, src1, ids, cur_a,
|
|
9084
|
+
ir0_start, ir0_end, ir1_start, ir1_end,
|
|
9085
|
+
src0_cur, matrix_rows, row_size, src1_cont, wdata
|
|
9086
|
+
);
|
|
7782
9087
|
|
|
7783
|
-
|
|
7784
|
-
|
|
9088
|
+
if (nth >= nchunk0 * nchunk1) {
|
|
9089
|
+
break;
|
|
7785
9090
|
}
|
|
9091
|
+
|
|
9092
|
+
current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
|
|
7786
9093
|
}
|
|
7787
9094
|
}
|
|
7788
|
-
|
|
7789
|
-
#undef MMID_MATRIX_ROW
|
|
7790
9095
|
}
|
|
7791
9096
|
|
|
7792
9097
|
// ggml_compute_forward_out_prod
|
|
@@ -9080,10 +10385,6 @@ static void ggml_compute_forward_clamp_f32(
|
|
|
9080
10385
|
|
|
9081
10386
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
9082
10387
|
|
|
9083
|
-
if (params->ith != 0) {
|
|
9084
|
-
return;
|
|
9085
|
-
}
|
|
9086
|
-
|
|
9087
10388
|
float min;
|
|
9088
10389
|
float max;
|
|
9089
10390
|
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
|
|
@@ -9114,6 +10415,43 @@ static void ggml_compute_forward_clamp_f32(
|
|
|
9114
10415
|
}
|
|
9115
10416
|
}
|
|
9116
10417
|
|
|
10418
|
+
static void ggml_compute_forward_clamp_f16(
|
|
10419
|
+
const struct ggml_compute_params * params,
|
|
10420
|
+
struct ggml_tensor * dst) {
|
|
10421
|
+
|
|
10422
|
+
const struct ggml_tensor * src0 = dst->src[0];
|
|
10423
|
+
|
|
10424
|
+
float min;
|
|
10425
|
+
float max;
|
|
10426
|
+
memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
|
|
10427
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
|
10428
|
+
|
|
10429
|
+
const int ith = params->ith;
|
|
10430
|
+
const int nth = params->nth;
|
|
10431
|
+
|
|
10432
|
+
const int n = ggml_nrows(src0);
|
|
10433
|
+
const int nc = src0->ne[0];
|
|
10434
|
+
|
|
10435
|
+
const size_t nb00 = src0->nb[0];
|
|
10436
|
+
const size_t nb01 = src0->nb[1];
|
|
10437
|
+
|
|
10438
|
+
const size_t nb0 = dst->nb[0];
|
|
10439
|
+
const size_t nb1 = dst->nb[1];
|
|
10440
|
+
|
|
10441
|
+
GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
|
|
10442
|
+
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
|
10443
|
+
|
|
10444
|
+
for (int j = ith; j < n; j += nth) {
|
|
10445
|
+
ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1);
|
|
10446
|
+
ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
|
|
10447
|
+
|
|
10448
|
+
for (int i = 0; i < nc; i++) {
|
|
10449
|
+
float v = GGML_FP16_TO_FP32(src0_ptr[i]);
|
|
10450
|
+
dst_ptr[i] = GGML_FP32_TO_FP16(MAX(MIN(v, max), min));
|
|
10451
|
+
}
|
|
10452
|
+
}
|
|
10453
|
+
}
|
|
10454
|
+
|
|
9117
10455
|
static void ggml_compute_forward_clamp(
|
|
9118
10456
|
const struct ggml_compute_params * params,
|
|
9119
10457
|
struct ggml_tensor * dst) {
|
|
@@ -9126,6 +10464,9 @@ static void ggml_compute_forward_clamp(
|
|
|
9126
10464
|
ggml_compute_forward_clamp_f32(params, dst);
|
|
9127
10465
|
} break;
|
|
9128
10466
|
case GGML_TYPE_F16:
|
|
10467
|
+
{
|
|
10468
|
+
ggml_compute_forward_clamp_f16(params, dst);
|
|
10469
|
+
} break;
|
|
9129
10470
|
case GGML_TYPE_BF16:
|
|
9130
10471
|
case GGML_TYPE_Q4_0:
|
|
9131
10472
|
case GGML_TYPE_Q4_1:
|
|
@@ -13723,14 +15064,19 @@ struct ggml_cplan ggml_graph_plan(
|
|
|
13723
15064
|
cur = 0;
|
|
13724
15065
|
const struct ggml_tensor * src0 = node->src[0];
|
|
13725
15066
|
const struct ggml_tensor * src1 = node->src[1];
|
|
15067
|
+
const struct ggml_tensor * ids = node->src[2];
|
|
13726
15068
|
const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
|
|
15069
|
+
const int n_as = src0->ne[2];
|
|
15070
|
+
// src1
|
|
13727
15071
|
if (src1->type != vec_dot_type) {
|
|
13728
|
-
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
|
|
15072
|
+
cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
|
|
13729
15073
|
}
|
|
13730
|
-
|
|
13731
|
-
cur +=
|
|
13732
|
-
|
|
13733
|
-
cur += n_as
|
|
15074
|
+
// matrix_row_counts
|
|
15075
|
+
cur += n_as * sizeof(int64_t) + sizeof(int64_t);
|
|
15076
|
+
// matrix_rows
|
|
15077
|
+
cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
|
|
15078
|
+
// atomic_current_chunk
|
|
15079
|
+
cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
|
|
13734
15080
|
} break;
|
|
13735
15081
|
case GGML_OP_OUT_PROD:
|
|
13736
15082
|
{
|
|
@@ -13862,9 +15208,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
13862
15208
|
tp->ec = GGML_STATUS_ABORTED;
|
|
13863
15209
|
}
|
|
13864
15210
|
|
|
13865
|
-
|
|
15211
|
+
if (node_n + 1 < cgraph->n_nodes) {
|
|
15212
|
+
ggml_barrier(state->threadpool);
|
|
15213
|
+
}
|
|
13866
15214
|
}
|
|
13867
15215
|
|
|
15216
|
+
ggml_barrier(state->threadpool);
|
|
15217
|
+
|
|
13868
15218
|
return 0;
|
|
13869
15219
|
}
|
|
13870
15220
|
|
|
@@ -14229,6 +15579,14 @@ int ggml_cpu_has_amx_int8(void) {
|
|
|
14229
15579
|
#endif
|
|
14230
15580
|
}
|
|
14231
15581
|
|
|
15582
|
+
int ggml_cpu_has_bmi2(void) {
|
|
15583
|
+
#if defined(__BMI2__)
|
|
15584
|
+
return 1;
|
|
15585
|
+
#else
|
|
15586
|
+
return 0;
|
|
15587
|
+
#endif
|
|
15588
|
+
}
|
|
15589
|
+
|
|
14232
15590
|
int ggml_cpu_has_fma(void) {
|
|
14233
15591
|
#if defined(__FMA__)
|
|
14234
15592
|
return 1;
|
|
@@ -14309,6 +15667,14 @@ int ggml_cpu_has_vsx(void) {
|
|
|
14309
15667
|
#endif
|
|
14310
15668
|
}
|
|
14311
15669
|
|
|
15670
|
+
int ggml_cpu_has_vxe(void) {
|
|
15671
|
+
#if defined(__VXE__) || defined(__VXE2__)
|
|
15672
|
+
return 1;
|
|
15673
|
+
#else
|
|
15674
|
+
return 0;
|
|
15675
|
+
#endif
|
|
15676
|
+
}
|
|
15677
|
+
|
|
14312
15678
|
int ggml_cpu_has_neon(void) {
|
|
14313
15679
|
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
|
|
14314
15680
|
return ggml_arm_arch_features.has_neon;
|
|
@@ -14349,6 +15715,14 @@ int ggml_cpu_get_sve_cnt(void) {
|
|
|
14349
15715
|
#endif
|
|
14350
15716
|
}
|
|
14351
15717
|
|
|
15718
|
+
int ggml_cpu_has_sme(void) {
|
|
15719
|
+
#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
|
|
15720
|
+
return ggml_arm_arch_features.has_sme;
|
|
15721
|
+
#else
|
|
15722
|
+
return 0;
|
|
15723
|
+
#endif
|
|
15724
|
+
}
|
|
15725
|
+
|
|
14352
15726
|
void ggml_cpu_init(void) {
|
|
14353
15727
|
// needed to initialize f16 tables
|
|
14354
15728
|
{
|