llama_cpp 0.15.4 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include "ggml-quants.h"
|
|
6
6
|
#include "ggml.h"
|
|
7
7
|
|
|
8
|
+
|
|
8
9
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
9
10
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
|
10
11
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
|
@@ -28,6 +29,10 @@
|
|
|
28
29
|
#include <syscall.h>
|
|
29
30
|
#endif
|
|
30
31
|
|
|
32
|
+
#ifdef GGML_USE_OPENMP
|
|
33
|
+
#include <omp.h>
|
|
34
|
+
#endif
|
|
35
|
+
|
|
31
36
|
#ifdef GGML_USE_METAL
|
|
32
37
|
#include <unistd.h>
|
|
33
38
|
#endif
|
|
@@ -292,17 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
|
292
297
|
|
|
293
298
|
#if defined(GGML_USE_ACCELERATE)
|
|
294
299
|
#include <Accelerate/Accelerate.h>
|
|
295
|
-
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
|
296
|
-
#include "ggml-opencl.h"
|
|
297
|
-
#endif
|
|
298
|
-
#elif defined(GGML_USE_OPENBLAS)
|
|
299
|
-
#if defined(GGML_BLAS_USE_MKL)
|
|
300
|
-
#include <mkl.h>
|
|
301
|
-
#else
|
|
302
|
-
#include <cblas.h>
|
|
303
|
-
#endif
|
|
304
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
305
|
-
#include "ggml-opencl.h"
|
|
306
300
|
#endif
|
|
307
301
|
|
|
308
302
|
// floating point type used to accumulate sums
|
|
@@ -1756,7 +1750,7 @@ struct ggml_compute_state_shared {
|
|
|
1756
1750
|
int64_t perf_node_start_cycles;
|
|
1757
1751
|
int64_t perf_node_start_time_us;
|
|
1758
1752
|
|
|
1759
|
-
|
|
1753
|
+
int n_threads;
|
|
1760
1754
|
|
|
1761
1755
|
// synchronization primitives
|
|
1762
1756
|
atomic_int n_active; // num active threads
|
|
@@ -2267,6 +2261,11 @@ inline static float ggml_silu_f32(float x) {
|
|
|
2267
2261
|
return x/(1.0f + expf(-x));
|
|
2268
2262
|
}
|
|
2269
2263
|
|
|
2264
|
+
#if __FINITE_MATH_ONLY__
|
|
2265
|
+
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
|
2266
|
+
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
|
2267
|
+
#endif
|
|
2268
|
+
|
|
2270
2269
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
|
2271
2270
|
|
|
2272
2271
|
// adapted from arm limited optimized routine
|
|
@@ -3207,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
|
3207
3206
|
return tensor->nb[0] > tensor->nb[1];
|
|
3208
3207
|
}
|
|
3209
3208
|
|
|
3210
|
-
|
|
3211
|
-
|
|
3209
|
+
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
|
3210
|
+
size_t next_nb = ggml_type_size(tensor->type);
|
|
3211
|
+
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
|
|
3212
|
+
return false;
|
|
3213
|
+
}
|
|
3214
|
+
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
|
|
3215
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
3216
|
+
if (tensor->ne[i] != 1) {
|
|
3217
|
+
if (i > n) {
|
|
3218
|
+
if (tensor->nb[i] != next_nb) {
|
|
3219
|
+
return false;
|
|
3220
|
+
}
|
|
3221
|
+
next_nb *= tensor->ne[i];
|
|
3222
|
+
} else {
|
|
3223
|
+
// this dimension does not need to be contiguous
|
|
3224
|
+
next_nb = tensor->ne[i]*tensor->nb[i];
|
|
3225
|
+
}
|
|
3226
|
+
}
|
|
3227
|
+
}
|
|
3228
|
+
return true;
|
|
3229
|
+
}
|
|
3212
3230
|
|
|
3213
|
-
|
|
3214
|
-
|
|
3215
|
-
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
|
3216
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
|
3217
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3231
|
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|
3232
|
+
return ggml_is_contiguous_0(tensor);
|
|
3218
3233
|
}
|
|
3219
3234
|
|
|
3220
3235
|
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
|
3221
|
-
return
|
|
3236
|
+
return ggml_is_contiguous_n(tensor, 0);
|
|
3222
3237
|
}
|
|
3223
3238
|
|
|
3224
3239
|
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
|
3225
|
-
|
|
3226
|
-
|
|
3227
|
-
return
|
|
3228
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
|
3229
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
|
3230
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3240
|
+
return ggml_is_contiguous_n(tensor, 1);
|
|
3231
3241
|
}
|
|
3232
3242
|
|
|
3233
3243
|
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
return
|
|
3237
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
|
3238
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3244
|
+
return ggml_is_contiguous_n(tensor, 2);
|
|
3239
3245
|
}
|
|
3240
3246
|
|
|
3241
3247
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
|
@@ -3267,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
|
3267
3273
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
3268
3274
|
|
|
3269
3275
|
return
|
|
3270
|
-
(t0->ne[0] == t1->ne[0]
|
|
3271
|
-
(t0->ne[1] == t1->ne[1]
|
|
3272
|
-
(t0->ne[2] == t1->ne[2]
|
|
3273
|
-
(t0->ne[3] == t1->ne[3]
|
|
3276
|
+
(t0->ne[0] == t1->ne[0]) &&
|
|
3277
|
+
(t0->ne[1] == t1->ne[1]) &&
|
|
3278
|
+
(t0->ne[2] == t1->ne[2]) &&
|
|
3279
|
+
(t0->ne[3] == t1->ne[3]);
|
|
3274
3280
|
}
|
|
3275
3281
|
|
|
3276
3282
|
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
|
3277
3283
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
3278
3284
|
|
|
3279
3285
|
return
|
|
3280
|
-
(t0->nb[0] == t1->nb[0]
|
|
3281
|
-
(t0->nb[1] == t1->nb[1]
|
|
3282
|
-
(t0->nb[2] == t1->nb[2]
|
|
3283
|
-
(t0->nb[3] == t1->nb[3]
|
|
3286
|
+
(t0->nb[0] == t1->nb[0]) &&
|
|
3287
|
+
(t0->nb[1] == t1->nb[1]) &&
|
|
3288
|
+
(t0->nb[2] == t1->nb[2]) &&
|
|
3289
|
+
(t0->nb[3] == t1->nb[3]);
|
|
3284
3290
|
}
|
|
3285
3291
|
|
|
3286
3292
|
// check if t1 can be represented as a repeatition of t0
|
|
@@ -3370,10 +3376,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
3370
3376
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
3371
3377
|
}
|
|
3372
3378
|
|
|
3373
|
-
#if defined(GGML_USE_CLBLAST)
|
|
3374
|
-
ggml_cl_init();
|
|
3375
|
-
#endif
|
|
3376
|
-
|
|
3377
3379
|
ggml_setup_op_has_task_pass();
|
|
3378
3380
|
|
|
3379
3381
|
is_first_call = false;
|
|
@@ -4077,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
|
4077
4079
|
switch (tensor->type) {
|
|
4078
4080
|
case GGML_TYPE_I8:
|
|
4079
4081
|
{
|
|
4080
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
|
4081
4082
|
return ((int8_t *)(tensor->data))[i];
|
|
4082
4083
|
}
|
|
4083
4084
|
case GGML_TYPE_I16:
|
|
4084
4085
|
{
|
|
4085
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
|
4086
4086
|
return ((int16_t *)(tensor->data))[i];
|
|
4087
4087
|
}
|
|
4088
4088
|
case GGML_TYPE_I32:
|
|
4089
4089
|
{
|
|
4090
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
|
4091
4090
|
return ((int32_t *)(tensor->data))[i];
|
|
4092
4091
|
}
|
|
4093
4092
|
case GGML_TYPE_F16:
|
|
4094
4093
|
{
|
|
4095
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
4096
4094
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
|
4097
4095
|
}
|
|
4098
4096
|
case GGML_TYPE_BF16:
|
|
4099
4097
|
{
|
|
4100
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
|
4101
4098
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
|
|
4102
4099
|
}
|
|
4103
4100
|
case GGML_TYPE_F32:
|
|
4104
4101
|
{
|
|
4105
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
|
4106
4102
|
return ((float *)(tensor->data))[i];
|
|
4107
4103
|
}
|
|
4108
4104
|
default:
|
|
@@ -4124,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
|
4124
4120
|
switch (tensor->type) {
|
|
4125
4121
|
case GGML_TYPE_I8:
|
|
4126
4122
|
{
|
|
4127
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
|
4128
4123
|
((int8_t *)(tensor->data))[i] = value;
|
|
4129
4124
|
} break;
|
|
4130
4125
|
case GGML_TYPE_I16:
|
|
4131
4126
|
{
|
|
4132
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
|
4133
4127
|
((int16_t *)(tensor->data))[i] = value;
|
|
4134
4128
|
} break;
|
|
4135
4129
|
case GGML_TYPE_I32:
|
|
4136
4130
|
{
|
|
4137
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
|
4138
4131
|
((int32_t *)(tensor->data))[i] = value;
|
|
4139
4132
|
} break;
|
|
4140
4133
|
case GGML_TYPE_F16:
|
|
4141
4134
|
{
|
|
4142
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
4143
4135
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
|
4144
4136
|
} break;
|
|
4145
4137
|
case GGML_TYPE_BF16:
|
|
4146
4138
|
{
|
|
4147
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
|
4148
4139
|
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
|
|
4149
4140
|
} break;
|
|
4150
4141
|
case GGML_TYPE_F32:
|
|
4151
4142
|
{
|
|
4152
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
|
4153
4143
|
((float *)(tensor->data))[i] = value;
|
|
4154
4144
|
} break;
|
|
4155
4145
|
default:
|
|
@@ -6249,16 +6239,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
6249
6239
|
struct ggml_tensor * c,
|
|
6250
6240
|
int n_dims,
|
|
6251
6241
|
int mode,
|
|
6252
|
-
int
|
|
6253
|
-
int n_orig_ctx,
|
|
6242
|
+
int n_ctx_orig,
|
|
6254
6243
|
float freq_base,
|
|
6255
6244
|
float freq_scale,
|
|
6256
6245
|
float ext_factor,
|
|
6257
6246
|
float attn_factor,
|
|
6258
6247
|
float beta_fast,
|
|
6259
6248
|
float beta_slow,
|
|
6260
|
-
float xpos_base,
|
|
6261
|
-
bool xpos_down,
|
|
6262
6249
|
bool inplace) {
|
|
6263
6250
|
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
|
|
6264
6251
|
|
|
@@ -6279,15 +6266,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
6279
6266
|
|
|
6280
6267
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
|
6281
6268
|
|
|
6282
|
-
int32_t params[
|
|
6269
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
6283
6270
|
memcpy(params + 5, &freq_base, sizeof(float));
|
|
6284
6271
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
|
6285
6272
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
|
6286
6273
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
6287
6274
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
6288
6275
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
6289
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
|
6290
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
|
6291
6276
|
ggml_set_op_params(result, params, sizeof(params));
|
|
6292
6277
|
|
|
6293
6278
|
result->op = GGML_OP_ROPE;
|
|
@@ -6304,10 +6289,9 @@ struct ggml_tensor * ggml_rope(
|
|
|
6304
6289
|
struct ggml_tensor * a,
|
|
6305
6290
|
struct ggml_tensor * b,
|
|
6306
6291
|
int n_dims,
|
|
6307
|
-
int mode
|
|
6308
|
-
int n_ctx) {
|
|
6292
|
+
int mode) {
|
|
6309
6293
|
return ggml_rope_impl(
|
|
6310
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6294
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
|
6311
6295
|
);
|
|
6312
6296
|
}
|
|
6313
6297
|
|
|
@@ -6316,10 +6300,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
|
6316
6300
|
struct ggml_tensor * a,
|
|
6317
6301
|
struct ggml_tensor * b,
|
|
6318
6302
|
int n_dims,
|
|
6319
|
-
int mode
|
|
6320
|
-
int n_ctx) {
|
|
6303
|
+
int mode) {
|
|
6321
6304
|
return ggml_rope_impl(
|
|
6322
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6305
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
|
6323
6306
|
);
|
|
6324
6307
|
}
|
|
6325
6308
|
|
|
@@ -6330,8 +6313,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
|
6330
6313
|
struct ggml_tensor * c,
|
|
6331
6314
|
int n_dims,
|
|
6332
6315
|
int mode,
|
|
6333
|
-
int
|
|
6334
|
-
int n_orig_ctx,
|
|
6316
|
+
int n_ctx_orig,
|
|
6335
6317
|
float freq_base,
|
|
6336
6318
|
float freq_scale,
|
|
6337
6319
|
float ext_factor,
|
|
@@ -6339,8 +6321,8 @@ struct ggml_tensor * ggml_rope_ext(
|
|
|
6339
6321
|
float beta_fast,
|
|
6340
6322
|
float beta_slow) {
|
|
6341
6323
|
return ggml_rope_impl(
|
|
6342
|
-
ctx, a, b, c, n_dims, mode,
|
|
6343
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6324
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6325
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
6344
6326
|
);
|
|
6345
6327
|
}
|
|
6346
6328
|
|
|
@@ -6351,8 +6333,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
|
6351
6333
|
struct ggml_tensor * c,
|
|
6352
6334
|
int n_dims,
|
|
6353
6335
|
int mode,
|
|
6354
|
-
int
|
|
6355
|
-
int n_orig_ctx,
|
|
6336
|
+
int n_ctx_orig,
|
|
6356
6337
|
float freq_base,
|
|
6357
6338
|
float freq_scale,
|
|
6358
6339
|
float ext_factor,
|
|
@@ -6360,8 +6341,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
|
6360
6341
|
float beta_fast,
|
|
6361
6342
|
float beta_slow) {
|
|
6362
6343
|
return ggml_rope_impl(
|
|
6363
|
-
ctx, a, b, c, n_dims, mode,
|
|
6364
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6344
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6345
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
6365
6346
|
);
|
|
6366
6347
|
}
|
|
6367
6348
|
|
|
@@ -6371,8 +6352,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
|
6371
6352
|
struct ggml_tensor * b,
|
|
6372
6353
|
int n_dims,
|
|
6373
6354
|
int mode,
|
|
6374
|
-
int
|
|
6375
|
-
int n_orig_ctx,
|
|
6355
|
+
int n_ctx_orig,
|
|
6376
6356
|
float freq_base,
|
|
6377
6357
|
float freq_scale,
|
|
6378
6358
|
float ext_factor,
|
|
@@ -6380,8 +6360,8 @@ struct ggml_tensor * ggml_rope_custom(
|
|
|
6380
6360
|
float beta_fast,
|
|
6381
6361
|
float beta_slow) {
|
|
6382
6362
|
return ggml_rope_impl(
|
|
6383
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6384
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6363
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6364
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
6385
6365
|
);
|
|
6386
6366
|
}
|
|
6387
6367
|
|
|
@@ -6391,8 +6371,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
|
6391
6371
|
struct ggml_tensor * b,
|
|
6392
6372
|
int n_dims,
|
|
6393
6373
|
int mode,
|
|
6394
|
-
int
|
|
6395
|
-
int n_orig_ctx,
|
|
6374
|
+
int n_ctx_orig,
|
|
6396
6375
|
float freq_base,
|
|
6397
6376
|
float freq_scale,
|
|
6398
6377
|
float ext_factor,
|
|
@@ -6400,21 +6379,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
|
6400
6379
|
float beta_fast,
|
|
6401
6380
|
float beta_slow) {
|
|
6402
6381
|
return ggml_rope_impl(
|
|
6403
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6404
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6382
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6383
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
6405
6384
|
);
|
|
6406
6385
|
}
|
|
6407
6386
|
|
|
6408
|
-
struct ggml_tensor * ggml_rope_xpos_inplace(
|
|
6409
|
-
struct ggml_context * ctx,
|
|
6410
|
-
struct ggml_tensor * a,
|
|
6411
|
-
struct ggml_tensor * b,
|
|
6412
|
-
int n_dims,
|
|
6413
|
-
float base,
|
|
6414
|
-
bool down) {
|
|
6415
|
-
return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
|
|
6416
|
-
}
|
|
6417
|
-
|
|
6418
6387
|
// ggml_rope_back
|
|
6419
6388
|
|
|
6420
6389
|
struct ggml_tensor * ggml_rope_back(
|
|
@@ -6424,16 +6393,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
|
6424
6393
|
struct ggml_tensor * c,
|
|
6425
6394
|
int n_dims,
|
|
6426
6395
|
int mode,
|
|
6427
|
-
int
|
|
6428
|
-
int n_orig_ctx,
|
|
6396
|
+
int n_ctx_orig,
|
|
6429
6397
|
float freq_base,
|
|
6430
6398
|
float freq_scale,
|
|
6431
6399
|
float ext_factor,
|
|
6432
6400
|
float attn_factor,
|
|
6433
6401
|
float beta_fast,
|
|
6434
|
-
float beta_slow
|
|
6435
|
-
float xpos_base,
|
|
6436
|
-
bool xpos_down) {
|
|
6402
|
+
float beta_slow) {
|
|
6437
6403
|
GGML_ASSERT(ggml_is_vector(b));
|
|
6438
6404
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
|
6439
6405
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
|
@@ -6449,15 +6415,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
|
6449
6415
|
|
|
6450
6416
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
6451
6417
|
|
|
6452
|
-
int32_t params[
|
|
6418
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
6453
6419
|
memcpy(params + 5, &freq_base, sizeof(float));
|
|
6454
6420
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
|
6455
6421
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
|
6456
6422
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
6457
6423
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
6458
6424
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
6459
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
|
6460
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
|
6461
6425
|
ggml_set_op_params(result, params, sizeof(params));
|
|
6462
6426
|
|
|
6463
6427
|
result->op = GGML_OP_ROPE_BACK;
|
|
@@ -7368,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
|
|
|
7368
7332
|
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
|
|
7369
7333
|
}
|
|
7370
7334
|
|
|
7371
|
-
//
|
|
7335
|
+
// ggml_unary
|
|
7372
7336
|
|
|
7373
7337
|
static struct ggml_tensor * ggml_unary_impl(
|
|
7374
7338
|
struct ggml_context * ctx,
|
|
7375
7339
|
struct ggml_tensor * a,
|
|
7376
7340
|
enum ggml_unary_op op,
|
|
7377
7341
|
bool inplace) {
|
|
7342
|
+
GGML_ASSERT(ggml_is_contiguous_1(a));
|
|
7343
|
+
|
|
7378
7344
|
bool is_node = false;
|
|
7379
7345
|
|
|
7380
7346
|
if (!inplace && (a->grad)) {
|
|
@@ -9043,17 +9009,6 @@ static void ggml_compute_forward_add_f32(
|
|
|
9043
9009
|
const int ith = params->ith;
|
|
9044
9010
|
const int nth = params->nth;
|
|
9045
9011
|
|
|
9046
|
-
#ifdef GGML_USE_CLBLAST
|
|
9047
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
|
9048
|
-
// TODO: OpenCL kernel support full broadcast
|
|
9049
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
|
9050
|
-
if (ith == 0) {
|
|
9051
|
-
ggml_cl_add(src0, src1, dst);
|
|
9052
|
-
}
|
|
9053
|
-
return;
|
|
9054
|
-
}
|
|
9055
|
-
#endif
|
|
9056
|
-
|
|
9057
9012
|
const int nr = ggml_nrows(src0);
|
|
9058
9013
|
|
|
9059
9014
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
@@ -10161,17 +10116,6 @@ static void ggml_compute_forward_mul_f32(
|
|
|
10161
10116
|
const int ith = params->ith;
|
|
10162
10117
|
const int nth = params->nth;
|
|
10163
10118
|
|
|
10164
|
-
#if defined(GGML_USE_CLBLAST)
|
|
10165
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
|
10166
|
-
// TODO: OpenCL kernel support full broadcast
|
|
10167
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
|
10168
|
-
if (ith == 0) {
|
|
10169
|
-
ggml_cl_mul(src0, src1, dst);
|
|
10170
|
-
}
|
|
10171
|
-
return;
|
|
10172
|
-
}
|
|
10173
|
-
#endif
|
|
10174
|
-
|
|
10175
10119
|
const int64_t nr = ggml_nrows(src0);
|
|
10176
10120
|
|
|
10177
10121
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
@@ -11061,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
|
|
|
11061
11005
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11062
11006
|
|
|
11063
11007
|
assert(params->ith == 0);
|
|
11008
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11009
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11064
11010
|
assert(ggml_are_same_shape(src0, dst));
|
|
11065
11011
|
|
|
11066
11012
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11070,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
|
|
|
11070
11016
|
const int n = ggml_nrows(src0);
|
|
11071
11017
|
const int nc = src0->ne[0];
|
|
11072
11018
|
|
|
11073
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11074
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11075
|
-
|
|
11076
11019
|
for (int i = 0; i < n; i++) {
|
|
11077
11020
|
ggml_vec_abs_f32(nc,
|
|
11078
11021
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11107,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
|
|
|
11107
11050
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11108
11051
|
|
|
11109
11052
|
assert(params->ith == 0);
|
|
11053
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11054
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11110
11055
|
assert(ggml_are_same_shape(src0, dst));
|
|
11111
11056
|
|
|
11112
11057
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11116,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
|
|
|
11116
11061
|
const int n = ggml_nrows(src0);
|
|
11117
11062
|
const int nc = src0->ne[0];
|
|
11118
11063
|
|
|
11119
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11120
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11121
|
-
|
|
11122
11064
|
for (int i = 0; i < n; i++) {
|
|
11123
11065
|
ggml_vec_sgn_f32(nc,
|
|
11124
11066
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11153,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
|
|
|
11153
11095
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11154
11096
|
|
|
11155
11097
|
assert(params->ith == 0);
|
|
11098
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11099
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11156
11100
|
assert(ggml_are_same_shape(src0, dst));
|
|
11157
11101
|
|
|
11158
11102
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11162,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
|
|
|
11162
11106
|
const int n = ggml_nrows(src0);
|
|
11163
11107
|
const int nc = src0->ne[0];
|
|
11164
11108
|
|
|
11165
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11166
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11167
|
-
|
|
11168
11109
|
for (int i = 0; i < n; i++) {
|
|
11169
11110
|
ggml_vec_neg_f32(nc,
|
|
11170
11111
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11199,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
|
|
|
11199
11140
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11200
11141
|
|
|
11201
11142
|
assert(params->ith == 0);
|
|
11143
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11144
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11202
11145
|
assert(ggml_are_same_shape(src0, dst));
|
|
11203
11146
|
|
|
11204
11147
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11208,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
|
|
|
11208
11151
|
const int n = ggml_nrows(src0);
|
|
11209
11152
|
const int nc = src0->ne[0];
|
|
11210
11153
|
|
|
11211
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11212
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11213
|
-
|
|
11214
11154
|
for (int i = 0; i < n; i++) {
|
|
11215
11155
|
ggml_vec_step_f32(nc,
|
|
11216
11156
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11245,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
|
|
|
11245
11185
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11246
11186
|
|
|
11247
11187
|
assert(params->ith == 0);
|
|
11188
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11189
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11248
11190
|
assert(ggml_are_same_shape(src0, dst));
|
|
11249
11191
|
|
|
11250
11192
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11254,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
|
|
|
11254
11196
|
const int n = ggml_nrows(src0);
|
|
11255
11197
|
const int nc = src0->ne[0];
|
|
11256
11198
|
|
|
11257
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11258
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11259
|
-
|
|
11260
11199
|
for (int i = 0; i < n; i++) {
|
|
11261
11200
|
ggml_vec_tanh_f32(nc,
|
|
11262
11201
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11291,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
|
|
|
11291
11230
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11292
11231
|
|
|
11293
11232
|
assert(params->ith == 0);
|
|
11233
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11234
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11294
11235
|
assert(ggml_are_same_shape(src0, dst));
|
|
11295
11236
|
|
|
11296
11237
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11300,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
|
|
|
11300
11241
|
const int n = ggml_nrows(src0);
|
|
11301
11242
|
const int nc = src0->ne[0];
|
|
11302
11243
|
|
|
11303
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11304
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11305
|
-
|
|
11306
11244
|
for (int i = 0; i < n; i++) {
|
|
11307
11245
|
ggml_vec_elu_f32(nc,
|
|
11308
11246
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11337,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
|
|
|
11337
11275
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11338
11276
|
|
|
11339
11277
|
assert(params->ith == 0);
|
|
11278
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11279
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11340
11280
|
assert(ggml_are_same_shape(src0, dst));
|
|
11341
11281
|
|
|
11342
11282
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11346,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
|
|
|
11346
11286
|
const int n = ggml_nrows(src0);
|
|
11347
11287
|
const int nc = src0->ne[0];
|
|
11348
11288
|
|
|
11349
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11350
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11351
|
-
|
|
11352
11289
|
for (int i = 0; i < n; i++) {
|
|
11353
11290
|
ggml_vec_relu_f32(nc,
|
|
11354
11291
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11383,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
|
11383
11320
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11384
11321
|
|
|
11385
11322
|
assert(params->ith == 0);
|
|
11323
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11324
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11386
11325
|
assert(ggml_are_same_shape(src0, dst));
|
|
11387
11326
|
|
|
11388
11327
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11392,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
|
11392
11331
|
const int n = ggml_nrows(src0);
|
|
11393
11332
|
const int nc = src0->ne[0];
|
|
11394
11333
|
|
|
11395
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11396
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11397
|
-
|
|
11398
11334
|
for (int i = 0; i < n; i++) {
|
|
11399
11335
|
ggml_vec_sigmoid_f32(nc,
|
|
11400
11336
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11428,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
|
|
|
11428
11364
|
|
|
11429
11365
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11430
11366
|
|
|
11431
|
-
|
|
11432
|
-
|
|
11433
|
-
|
|
11367
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11368
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11369
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11434
11370
|
|
|
11435
11371
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11436
11372
|
return;
|
|
@@ -11491,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
|
11491
11427
|
|
|
11492
11428
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11493
11429
|
|
|
11494
|
-
|
|
11495
|
-
|
|
11496
|
-
|
|
11430
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11431
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11432
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11497
11433
|
|
|
11498
11434
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11499
11435
|
return;
|
|
@@ -11554,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
|
|
|
11554
11490
|
|
|
11555
11491
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11556
11492
|
|
|
11557
|
-
|
|
11558
|
-
|
|
11559
|
-
|
|
11493
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11494
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11495
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11560
11496
|
|
|
11561
11497
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11562
11498
|
return;
|
|
@@ -11617,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
|
|
|
11617
11553
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11618
11554
|
|
|
11619
11555
|
assert(params->ith == 0);
|
|
11556
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11557
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11620
11558
|
assert(ggml_are_same_shape(src0, dst));
|
|
11621
11559
|
|
|
11622
11560
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11666,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
|
|
|
11666
11604
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11667
11605
|
const struct ggml_tensor * grad = dst->src[1];
|
|
11668
11606
|
|
|
11669
|
-
|
|
11670
|
-
|
|
11671
|
-
|
|
11672
|
-
|
|
11673
|
-
|
|
11607
|
+
assert(ggml_is_contiguous_1(grad));
|
|
11608
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11609
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11610
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11611
|
+
assert(ggml_are_same_shape(src0, grad));
|
|
11674
11612
|
|
|
11675
11613
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11676
11614
|
return;
|
|
@@ -11732,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
|
|
|
11732
11670
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11733
11671
|
|
|
11734
11672
|
assert(params->ith == 0);
|
|
11673
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11674
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11735
11675
|
assert(ggml_are_same_shape(src0, dst));
|
|
11736
11676
|
|
|
11737
11677
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11741,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
|
|
|
11741
11681
|
const int n = ggml_nrows(src0);
|
|
11742
11682
|
const int nc = src0->ne[0];
|
|
11743
11683
|
|
|
11744
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11745
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11746
|
-
|
|
11747
11684
|
for (int i = 0; i < n; i++) {
|
|
11748
11685
|
ggml_vec_hardswish_f32(nc,
|
|
11749
11686
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11775,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
|
11775
11712
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11776
11713
|
|
|
11777
11714
|
assert(params->ith == 0);
|
|
11715
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11716
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11778
11717
|
assert(ggml_are_same_shape(src0, dst));
|
|
11779
11718
|
|
|
11780
11719
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11784,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
|
11784
11723
|
const int n = ggml_nrows(src0);
|
|
11785
11724
|
const int nc = src0->ne[0];
|
|
11786
11725
|
|
|
11787
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11788
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11789
|
-
|
|
11790
11726
|
for (int i = 0; i < n; i++) {
|
|
11791
11727
|
ggml_vec_hardsigmoid_f32(nc,
|
|
11792
11728
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -12237,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
|
|
|
12237
12173
|
|
|
12238
12174
|
// ggml_compute_forward_mul_mat
|
|
12239
12175
|
|
|
12240
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12241
|
-
// helper function to determine if it is better to use BLAS or not
|
|
12242
|
-
// for large matrices, BLAS is faster
|
|
12243
|
-
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
12244
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
|
12245
|
-
const struct ggml_tensor * src1 = dst->src[1];
|
|
12246
|
-
|
|
12247
|
-
//const int64_t ne00 = src0->ne[0];
|
|
12248
|
-
//const int64_t ne01 = src0->ne[1];
|
|
12249
|
-
|
|
12250
|
-
const int64_t ne10 = src1->ne[0];
|
|
12251
|
-
|
|
12252
|
-
const int64_t ne0 = dst->ne[0];
|
|
12253
|
-
const int64_t ne1 = dst->ne[1];
|
|
12254
|
-
|
|
12255
|
-
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
|
12256
|
-
// all the experts for each batch element and the processing would become incredibly slow
|
|
12257
|
-
// TODO: find the optimal values for these
|
|
12258
|
-
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
|
12259
|
-
ggml_is_contiguous(src0) &&
|
|
12260
|
-
ggml_is_contiguous(src1) &&
|
|
12261
|
-
//src0->type == GGML_TYPE_F32 &&
|
|
12262
|
-
src1->type == GGML_TYPE_F32 &&
|
|
12263
|
-
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
|
12264
|
-
|
|
12265
|
-
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
|
12266
|
-
return true;
|
|
12267
|
-
}
|
|
12268
|
-
|
|
12269
|
-
return false;
|
|
12270
|
-
}
|
|
12271
|
-
#endif
|
|
12272
|
-
|
|
12273
12176
|
static void ggml_compute_forward_mul_mat_one_chunk(
|
|
12274
12177
|
const struct ggml_compute_params * params,
|
|
12275
12178
|
struct ggml_tensor * dst,
|
|
@@ -12407,82 +12310,6 @@ static void ggml_compute_forward_mul_mat(
|
|
|
12407
12310
|
// nb01 >= nb00 - src0 is not transposed
|
|
12408
12311
|
// compute by src0 rows
|
|
12409
12312
|
|
|
12410
|
-
#if defined(GGML_USE_CLBLAST)
|
|
12411
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
|
12412
|
-
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
|
12413
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
|
12414
|
-
}
|
|
12415
|
-
return;
|
|
12416
|
-
}
|
|
12417
|
-
#endif
|
|
12418
|
-
|
|
12419
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12420
|
-
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
|
12421
|
-
const int64_t ne_plane = ne01*ne00;
|
|
12422
|
-
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
|
12423
|
-
UNUSED(desired_wsize);
|
|
12424
|
-
|
|
12425
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
12426
|
-
if (type != GGML_TYPE_F32) {
|
|
12427
|
-
assert(params->wsize >= desired_wsize);
|
|
12428
|
-
// parallelize by src0 rows
|
|
12429
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
12430
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
12431
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
|
12432
|
-
const int64_t i03 = i13/r3;
|
|
12433
|
-
const int64_t i02 = i12/r2;
|
|
12434
|
-
|
|
12435
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
12436
|
-
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
12437
|
-
ggml_to_float_t const to_float = type_traits[type].to_float;
|
|
12438
|
-
|
|
12439
|
-
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
|
12440
|
-
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
|
12441
|
-
}
|
|
12442
|
-
}
|
|
12443
|
-
}
|
|
12444
|
-
}
|
|
12445
|
-
return;
|
|
12446
|
-
}
|
|
12447
|
-
|
|
12448
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
12449
|
-
return;
|
|
12450
|
-
}
|
|
12451
|
-
|
|
12452
|
-
// perform sgemm, parallelization controlled by blas lib
|
|
12453
|
-
if (ith != 0) {
|
|
12454
|
-
return;
|
|
12455
|
-
}
|
|
12456
|
-
|
|
12457
|
-
//const int64_t tgemm0 = ggml_perf_time_us();
|
|
12458
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
12459
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
12460
|
-
const int64_t i03 = i13/r3;
|
|
12461
|
-
const int64_t i02 = i12/r2;
|
|
12462
|
-
|
|
12463
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
12464
|
-
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
|
12465
|
-
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
|
12466
|
-
|
|
12467
|
-
if (type != GGML_TYPE_F32) {
|
|
12468
|
-
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
12469
|
-
}
|
|
12470
|
-
|
|
12471
|
-
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
|
12472
|
-
ne1, ne01, ne10,
|
|
12473
|
-
1.0f, y, ne10,
|
|
12474
|
-
x, ne00,
|
|
12475
|
-
0.0f, d, ne01);
|
|
12476
|
-
}
|
|
12477
|
-
}
|
|
12478
|
-
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
|
12479
|
-
|
|
12480
|
-
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
12481
|
-
|
|
12482
|
-
return;
|
|
12483
|
-
}
|
|
12484
|
-
#endif
|
|
12485
|
-
|
|
12486
12313
|
#if GGML_USE_LLAMAFILE
|
|
12487
12314
|
const bool src1_cont = ggml_is_contiguous(src1);
|
|
12488
12315
|
|
|
@@ -12863,21 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|
|
12863
12690
|
// nb01 >= nb00 - src0 is not transposed
|
|
12864
12691
|
// compute by src0 rows
|
|
12865
12692
|
|
|
12866
|
-
// TODO: #if defined(GGML_USE_CLBLAST)
|
|
12867
|
-
|
|
12868
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12869
|
-
bool use_blas = ggml_is_matrix(src0) &&
|
|
12870
|
-
ggml_is_matrix(src1) &&
|
|
12871
|
-
ggml_is_contiguous(src0) &&
|
|
12872
|
-
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
|
12873
|
-
#endif
|
|
12874
|
-
|
|
12875
12693
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
12876
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
|
12877
|
-
if (use_blas) {
|
|
12878
|
-
return;
|
|
12879
|
-
}
|
|
12880
|
-
#endif
|
|
12881
12694
|
if (ith != 0) {
|
|
12882
12695
|
return;
|
|
12883
12696
|
}
|
|
@@ -12889,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
|
12889
12702
|
return;
|
|
12890
12703
|
}
|
|
12891
12704
|
|
|
12892
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12893
|
-
if (use_blas) {
|
|
12894
|
-
if (params->ith != 0) { // All threads other than the first do no work.
|
|
12895
|
-
return;
|
|
12896
|
-
}
|
|
12897
|
-
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
|
12898
|
-
// src0: (k,n)
|
|
12899
|
-
// src1: (k,m)
|
|
12900
|
-
// dst: (m,n)
|
|
12901
|
-
//
|
|
12902
|
-
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
|
12903
|
-
// Also expressed as (major,minor)
|
|
12904
|
-
// a: (m,k): so src1 transposed
|
|
12905
|
-
// b: (k,n): so src0
|
|
12906
|
-
// c: (m,n)
|
|
12907
|
-
//
|
|
12908
|
-
// However, if ggml_is_transposed(src1) is true, then
|
|
12909
|
-
// src1->data already contains a transposed version, so sgemm mustn't
|
|
12910
|
-
// transpose it further.
|
|
12911
|
-
|
|
12912
|
-
int n = src0->ne[0];
|
|
12913
|
-
int k = src0->ne[1];
|
|
12914
|
-
int m = src1->ne[0];
|
|
12915
|
-
|
|
12916
|
-
int transposeA, lda;
|
|
12917
|
-
|
|
12918
|
-
if (!ggml_is_transposed(src1)) {
|
|
12919
|
-
transposeA = CblasTrans;
|
|
12920
|
-
lda = m;
|
|
12921
|
-
} else {
|
|
12922
|
-
transposeA = CblasNoTrans;
|
|
12923
|
-
lda = k;
|
|
12924
|
-
}
|
|
12925
|
-
|
|
12926
|
-
float * a = (float *) ((char *) src1->data);
|
|
12927
|
-
float * b = (float *) ((char *) src0->data);
|
|
12928
|
-
float * c = (float *) ((char *) dst->data);
|
|
12929
|
-
|
|
12930
|
-
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
|
12931
|
-
|
|
12932
|
-
return;
|
|
12933
|
-
}
|
|
12934
|
-
#endif
|
|
12935
|
-
|
|
12936
12705
|
// dst[:,:,:,:] = 0
|
|
12937
12706
|
// for i2,i3:
|
|
12938
12707
|
// for i1:
|
|
@@ -13062,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
|
13062
12831
|
// nb01 >= nb00 - src0 is not transposed
|
|
13063
12832
|
// compute by src0 rows
|
|
13064
12833
|
|
|
13065
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
|
13066
|
-
|
|
13067
12834
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
13068
12835
|
if (ith != 0) {
|
|
13069
12836
|
return;
|
|
@@ -13460,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
|
|
|
13460
13227
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13461
13228
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13462
13229
|
|
|
13230
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13231
|
+
|
|
13463
13232
|
dequantize_row_q(
|
|
13464
13233
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13465
13234
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
@@ -13503,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
|
|
|
13503
13272
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13504
13273
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13505
13274
|
|
|
13275
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13276
|
+
|
|
13506
13277
|
ggml_fp16_to_fp32_row(
|
|
13507
13278
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13508
13279
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
@@ -13546,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
|
|
|
13546
13317
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13547
13318
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13548
13319
|
|
|
13549
|
-
|
|
13320
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13321
|
+
|
|
13322
|
+
ggml_bf16_to_fp32_row(
|
|
13550
13323
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13551
13324
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
13552
13325
|
}
|
|
@@ -13589,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
|
|
|
13589
13362
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13590
13363
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13591
13364
|
|
|
13365
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13366
|
+
|
|
13592
13367
|
ggml_vec_cpy_f32(nc,
|
|
13593
13368
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
|
13594
13369
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
|
@@ -14259,8 +14034,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
|
|
14259
14034
|
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
|
14260
14035
|
static void rope_yarn(
|
|
14261
14036
|
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
|
14262
|
-
float * cos_theta, float * sin_theta
|
|
14263
|
-
) {
|
|
14037
|
+
float * cos_theta, float * sin_theta) {
|
|
14264
14038
|
// Get n-d rotational scaling corrected for extrapolation
|
|
14265
14039
|
float theta_interp = freq_scale * theta_extrap;
|
|
14266
14040
|
float theta = theta_interp;
|
|
@@ -14277,18 +14051,19 @@ static void rope_yarn(
|
|
|
14277
14051
|
|
|
14278
14052
|
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
|
14279
14053
|
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
|
14280
|
-
static float ggml_rope_yarn_corr_dim(int n_dims, int
|
|
14281
|
-
return n_dims * logf(
|
|
14054
|
+
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
|
14055
|
+
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
14282
14056
|
}
|
|
14283
14057
|
|
|
14284
14058
|
static void ggml_rope_cache_init(
|
|
14285
|
-
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
|
14286
|
-
float * cache, float sin_sign, float theta_scale
|
|
14287
|
-
|
|
14059
|
+
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
|
14060
|
+
float * cache, float sin_sign, float theta_scale) {
|
|
14061
|
+
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
|
14288
14062
|
float theta = theta_base;
|
|
14289
14063
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14064
|
+
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
|
|
14290
14065
|
rope_yarn(
|
|
14291
|
-
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
|
14066
|
+
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
|
14292
14067
|
);
|
|
14293
14068
|
cache[i0 + 1] *= sin_sign;
|
|
14294
14069
|
|
|
@@ -14297,11 +14072,11 @@ static void ggml_rope_cache_init(
|
|
|
14297
14072
|
}
|
|
14298
14073
|
|
|
14299
14074
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
14300
|
-
int n_dims, int
|
|
14075
|
+
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
14301
14076
|
) {
|
|
14302
14077
|
// start and end correction dims
|
|
14303
|
-
float start = floorf(ggml_rope_yarn_corr_dim(n_dims,
|
|
14304
|
-
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims,
|
|
14078
|
+
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
|
14079
|
+
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
|
14305
14080
|
dims[0] = MAX(0, start);
|
|
14306
14081
|
dims[1] = MIN(n_dims - 1, end);
|
|
14307
14082
|
}
|
|
@@ -14321,15 +14096,11 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14321
14096
|
|
|
14322
14097
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
14323
14098
|
|
|
14324
|
-
// these two only relevant for xPos RoPE:
|
|
14325
|
-
float xpos_base;
|
|
14326
|
-
bool xpos_down;
|
|
14327
|
-
|
|
14328
14099
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14329
14100
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
14330
14101
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
14331
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14332
|
-
const int
|
|
14102
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14103
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
14333
14104
|
|
|
14334
14105
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
14335
14106
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
@@ -14337,8 +14108,6 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14337
14108
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
|
14338
14109
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
14339
14110
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
14340
|
-
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
|
|
14341
|
-
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
|
|
14342
14111
|
|
|
14343
14112
|
GGML_TENSOR_UNARY_OP_LOCALS
|
|
14344
14113
|
|
|
@@ -14368,20 +14137,15 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14368
14137
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
14369
14138
|
|
|
14370
14139
|
float corr_dims[2];
|
|
14371
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
|
14140
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
14372
14141
|
|
|
14373
14142
|
const bool is_neox = mode & 2;
|
|
14374
|
-
const bool is_glm = mode & 4;
|
|
14375
14143
|
|
|
14376
14144
|
const float * freq_factors = NULL;
|
|
14377
|
-
if (
|
|
14378
|
-
|
|
14379
|
-
|
|
14380
|
-
|
|
14381
|
-
freq_factors = (const float *) src2->data;
|
|
14382
|
-
}
|
|
14383
|
-
} else {
|
|
14384
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
|
14145
|
+
if (src2 != NULL) {
|
|
14146
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
|
14147
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
|
14148
|
+
freq_factors = (const float *) src2->data;
|
|
14385
14149
|
}
|
|
14386
14150
|
|
|
14387
14151
|
// backward process uses inverse rotation by cos and sin.
|
|
@@ -14396,94 +14160,50 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14396
14160
|
const int64_t p = pos[i2];
|
|
14397
14161
|
|
|
14398
14162
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
14399
|
-
|
|
14400
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14401
|
-
}
|
|
14163
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14402
14164
|
|
|
14403
14165
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
14404
14166
|
if (ir++ < ir0) continue;
|
|
14405
14167
|
if (ir > ir1) break;
|
|
14406
14168
|
|
|
14407
|
-
|
|
14408
|
-
|
|
14409
|
-
if (is_glm) {
|
|
14410
|
-
theta_base = MIN(p, n_ctx - 2);
|
|
14411
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
|
14412
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
|
14413
|
-
const float cos_theta = cosf(theta_base);
|
|
14414
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
|
14415
|
-
const float cos_block_theta = cosf(block_theta);
|
|
14416
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
|
14417
|
-
|
|
14418
|
-
theta_base *= theta_scale;
|
|
14419
|
-
block_theta *= theta_scale;
|
|
14420
|
-
|
|
14421
|
-
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14422
|
-
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14423
|
-
|
|
14424
|
-
const float x0 = src[0];
|
|
14425
|
-
const float x1 = src[n_dims/2];
|
|
14426
|
-
const float x2 = src[n_dims];
|
|
14427
|
-
const float x3 = src[n_dims/2*3];
|
|
14428
|
-
|
|
14429
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14430
|
-
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
14431
|
-
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
|
14432
|
-
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
|
14433
|
-
}
|
|
14434
|
-
} else if (!is_neox) {
|
|
14435
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14169
|
+
if (!is_neox) {
|
|
14170
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14436
14171
|
const float cos_theta = cache[i0 + 0];
|
|
14437
14172
|
const float sin_theta = cache[i0 + 1];
|
|
14438
14173
|
|
|
14439
|
-
// zeta scaling for xPos only:
|
|
14440
|
-
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
|
14441
|
-
if (xpos_down) zeta = 1.0f / zeta;
|
|
14442
|
-
|
|
14443
14174
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14444
14175
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14445
14176
|
|
|
14446
14177
|
const float x0 = src[0];
|
|
14447
14178
|
const float x1 = src[1];
|
|
14448
14179
|
|
|
14449
|
-
dst_data[0] = x0*cos_theta
|
|
14450
|
-
dst_data[1] = x0*sin_theta
|
|
14180
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14181
|
+
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
|
14451
14182
|
}
|
|
14452
14183
|
} else {
|
|
14453
|
-
|
|
14454
|
-
|
|
14455
|
-
if (ic < n_dims) {
|
|
14456
|
-
const int64_t i0 = ic/2;
|
|
14457
|
-
|
|
14458
|
-
const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
|
|
14459
|
-
|
|
14460
|
-
float cos_theta, sin_theta;
|
|
14461
|
-
rope_yarn(
|
|
14462
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
|
14463
|
-
&cos_theta, &sin_theta
|
|
14464
|
-
);
|
|
14184
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14185
|
+
const int64_t ic = i0/2;
|
|
14465
14186
|
|
|
14466
|
-
|
|
14467
|
-
|
|
14187
|
+
const float cos_theta = cache[i0 + 0];
|
|
14188
|
+
const float sin_theta = cache[i0 + 1];
|
|
14468
14189
|
|
|
14469
|
-
|
|
14470
|
-
|
|
14190
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
|
14191
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
|
14471
14192
|
|
|
14472
|
-
|
|
14473
|
-
|
|
14193
|
+
const float x0 = src[0];
|
|
14194
|
+
const float x1 = src[n_dims/2];
|
|
14474
14195
|
|
|
14475
|
-
|
|
14476
|
-
|
|
14477
|
-
|
|
14478
|
-
|
|
14196
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14197
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
14198
|
+
}
|
|
14199
|
+
}
|
|
14479
14200
|
|
|
14480
|
-
|
|
14481
|
-
|
|
14201
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
|
14202
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14203
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14482
14204
|
|
|
14483
|
-
|
|
14484
|
-
|
|
14485
|
-
}
|
|
14486
|
-
}
|
|
14205
|
+
dst_data[0] = src[0];
|
|
14206
|
+
dst_data[1] = src[1];
|
|
14487
14207
|
}
|
|
14488
14208
|
}
|
|
14489
14209
|
}
|
|
@@ -14509,8 +14229,8 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14509
14229
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14510
14230
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
14511
14231
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
14512
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14513
|
-
const int
|
|
14232
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14233
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
14514
14234
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
14515
14235
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
14516
14236
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
|
@@ -14546,20 +14266,15 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14546
14266
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
14547
14267
|
|
|
14548
14268
|
float corr_dims[2];
|
|
14549
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
|
14269
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
14550
14270
|
|
|
14551
14271
|
const bool is_neox = mode & 2;
|
|
14552
|
-
const bool is_glm = mode & 4;
|
|
14553
14272
|
|
|
14554
14273
|
const float * freq_factors = NULL;
|
|
14555
|
-
if (
|
|
14556
|
-
|
|
14557
|
-
|
|
14558
|
-
|
|
14559
|
-
freq_factors = (const float *) src2->data;
|
|
14560
|
-
}
|
|
14561
|
-
} else {
|
|
14562
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
|
14274
|
+
if (src2 != NULL) {
|
|
14275
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
|
14276
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
|
14277
|
+
freq_factors = (const float *) src2->data;
|
|
14563
14278
|
}
|
|
14564
14279
|
|
|
14565
14280
|
// backward process uses inverse rotation by cos and sin.
|
|
@@ -14574,43 +14289,14 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14574
14289
|
const int64_t p = pos[i2];
|
|
14575
14290
|
|
|
14576
14291
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
14577
|
-
|
|
14578
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14579
|
-
}
|
|
14292
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14580
14293
|
|
|
14581
14294
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
14582
14295
|
if (ir++ < ir0) continue;
|
|
14583
14296
|
if (ir > ir1) break;
|
|
14584
14297
|
|
|
14585
|
-
|
|
14586
|
-
|
|
14587
|
-
if (is_glm) {
|
|
14588
|
-
theta_base = MIN(p, n_ctx - 2);
|
|
14589
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
|
14590
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
|
14591
|
-
const float cos_theta = cosf(theta_base);
|
|
14592
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
|
14593
|
-
const float cos_block_theta = cosf(block_theta);
|
|
14594
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
|
14595
|
-
|
|
14596
|
-
theta_base *= theta_scale;
|
|
14597
|
-
block_theta *= theta_scale;
|
|
14598
|
-
|
|
14599
|
-
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14600
|
-
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14601
|
-
|
|
14602
|
-
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
|
14603
|
-
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
|
14604
|
-
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
|
|
14605
|
-
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
|
|
14606
|
-
|
|
14607
|
-
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
|
14608
|
-
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14609
|
-
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
|
14610
|
-
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
|
14611
|
-
}
|
|
14612
|
-
} else if (!is_neox) {
|
|
14613
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14298
|
+
if (!is_neox) {
|
|
14299
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14614
14300
|
const float cos_theta = cache[i0 + 0];
|
|
14615
14301
|
const float sin_theta = cache[i0 + 1];
|
|
14616
14302
|
|
|
@@ -14624,40 +14310,29 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14624
14310
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14625
14311
|
}
|
|
14626
14312
|
} else {
|
|
14627
|
-
|
|
14628
|
-
|
|
14629
|
-
if (ic < n_dims) {
|
|
14630
|
-
const int64_t i0 = ic/2;
|
|
14313
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14314
|
+
const int64_t ic = i0/2;
|
|
14631
14315
|
|
|
14632
|
-
|
|
14633
|
-
|
|
14634
|
-
float cos_theta, sin_theta;
|
|
14635
|
-
rope_yarn(
|
|
14636
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
|
14637
|
-
&cos_theta, &sin_theta
|
|
14638
|
-
);
|
|
14639
|
-
|
|
14640
|
-
sin_theta *= sin_sign;
|
|
14641
|
-
theta_base *= theta_scale;
|
|
14316
|
+
const float cos_theta = cache[i0 + 0];
|
|
14317
|
+
const float sin_theta = cache[i0 + 1];
|
|
14642
14318
|
|
|
14643
|
-
|
|
14644
|
-
|
|
14319
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
|
14320
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
|
14645
14321
|
|
|
14646
|
-
|
|
14647
|
-
|
|
14322
|
+
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
|
14323
|
+
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
|
14648
14324
|
|
|
14649
|
-
|
|
14650
|
-
|
|
14651
|
-
|
|
14652
|
-
|
|
14325
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
|
14326
|
+
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14327
|
+
}
|
|
14328
|
+
}
|
|
14653
14329
|
|
|
14654
|
-
|
|
14655
|
-
|
|
14330
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
|
14331
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14332
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14656
14333
|
|
|
14657
|
-
|
|
14658
|
-
|
|
14659
|
-
}
|
|
14660
|
-
}
|
|
14334
|
+
dst_data[0] = src[0];
|
|
14335
|
+
dst_data[1] = src[1];
|
|
14661
14336
|
}
|
|
14662
14337
|
}
|
|
14663
14338
|
}
|
|
@@ -16844,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
|
|
|
16844
16519
|
|
|
16845
16520
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
16846
16521
|
|
|
16847
|
-
|
|
16522
|
+
assert(params->ith == 0);
|
|
16523
|
+
assert(ggml_is_contiguous_1(src0));
|
|
16524
|
+
assert(ggml_is_contiguous_1(dst));
|
|
16525
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
16848
16526
|
|
|
16849
16527
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
16850
16528
|
return;
|
|
@@ -16853,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|
|
16853
16531
|
const int n = ggml_nrows(src0);
|
|
16854
16532
|
const int nc = src0->ne[0];
|
|
16855
16533
|
|
|
16856
|
-
assert( dst->nb[0] == sizeof(float));
|
|
16857
|
-
assert(src0->nb[0] == sizeof(float));
|
|
16858
|
-
|
|
16859
16534
|
for (int i = 0; i < n; i++) {
|
|
16860
16535
|
fun(nc,
|
|
16861
16536
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -16893,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
|
|
|
16893
16568
|
const struct ggml_tensor * src1 = dst->src[1];
|
|
16894
16569
|
|
|
16895
16570
|
assert(params->ith == 0);
|
|
16571
|
+
assert(ggml_is_contiguous_1(src0));
|
|
16572
|
+
assert(ggml_is_contiguous_1(src1));
|
|
16573
|
+
assert(ggml_is_contiguous_1(dst));
|
|
16896
16574
|
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
16897
16575
|
|
|
16898
16576
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -16902,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|
|
16902
16580
|
const int n = ggml_nrows(src0);
|
|
16903
16581
|
const int nc = src0->ne[0];
|
|
16904
16582
|
|
|
16905
|
-
assert( dst->nb[0] == sizeof(float));
|
|
16906
|
-
assert(src0->nb[0] == sizeof(float));
|
|
16907
|
-
assert(src1->nb[0] == sizeof(float));
|
|
16908
|
-
|
|
16909
16583
|
for (int i = 0; i < n; i++) {
|
|
16910
16584
|
fun(nc,
|
|
16911
16585
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -18359,9 +18033,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18359
18033
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
|
18360
18034
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
|
18361
18035
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
|
18362
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18363
|
-
const int
|
|
18364
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
18036
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18037
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
|
18038
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
18365
18039
|
|
|
18366
18040
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
|
18367
18041
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
|
@@ -18369,8 +18043,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18369
18043
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
|
18370
18044
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
|
18371
18045
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
|
18372
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
|
18373
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
|
18374
18046
|
|
|
18375
18047
|
src0->grad = ggml_add_or_set(ctx,
|
|
18376
18048
|
src0->grad,
|
|
@@ -18380,16 +18052,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18380
18052
|
src2,
|
|
18381
18053
|
n_dims,
|
|
18382
18054
|
mode,
|
|
18383
|
-
|
|
18384
|
-
n_orig_ctx,
|
|
18055
|
+
n_ctx_orig,
|
|
18385
18056
|
freq_base,
|
|
18386
18057
|
freq_scale,
|
|
18387
18058
|
ext_factor,
|
|
18388
18059
|
attn_factor,
|
|
18389
18060
|
beta_fast,
|
|
18390
|
-
beta_slow,
|
|
18391
|
-
xpos_base,
|
|
18392
|
-
xpos_down),
|
|
18061
|
+
beta_slow),
|
|
18393
18062
|
zero_table);
|
|
18394
18063
|
}
|
|
18395
18064
|
} break;
|
|
@@ -18399,9 +18068,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18399
18068
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
|
18400
18069
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
|
18401
18070
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
|
18402
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18403
|
-
const int
|
|
18404
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
18071
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18072
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
|
18073
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
18405
18074
|
|
|
18406
18075
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
|
18407
18076
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
|
@@ -18409,8 +18078,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18409
18078
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
|
18410
18079
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
|
18411
18080
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
|
18412
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
|
18413
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
|
18414
18081
|
|
|
18415
18082
|
src0->grad = ggml_add_or_set(ctx,
|
|
18416
18083
|
src0->grad,
|
|
@@ -18420,16 +18087,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18420
18087
|
src2,
|
|
18421
18088
|
n_dims,
|
|
18422
18089
|
mode,
|
|
18423
|
-
|
|
18424
|
-
n_orig_ctx,
|
|
18090
|
+
n_ctx_orig,
|
|
18425
18091
|
freq_base,
|
|
18426
18092
|
freq_scale,
|
|
18427
18093
|
ext_factor,
|
|
18428
18094
|
attn_factor,
|
|
18429
18095
|
beta_fast,
|
|
18430
18096
|
beta_slow,
|
|
18431
|
-
xpos_base,
|
|
18432
|
-
xpos_down,
|
|
18433
18097
|
false),
|
|
18434
18098
|
zero_table);
|
|
18435
18099
|
}
|
|
@@ -19073,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
|
19073
18737
|
switch (node->op) {
|
|
19074
18738
|
case GGML_OP_CPY:
|
|
19075
18739
|
case GGML_OP_DUP:
|
|
18740
|
+
case GGML_OP_CONT:
|
|
19076
18741
|
case GGML_OP_ADD:
|
|
19077
18742
|
case GGML_OP_ADD1:
|
|
19078
18743
|
case GGML_OP_ACC:
|
|
@@ -19157,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
|
19157
18822
|
} break;
|
|
19158
18823
|
case GGML_OP_SCALE:
|
|
19159
18824
|
case GGML_OP_SET:
|
|
19160
|
-
case GGML_OP_CONT:
|
|
19161
18825
|
case GGML_OP_RESHAPE:
|
|
19162
18826
|
case GGML_OP_VIEW:
|
|
19163
18827
|
case GGML_OP_PERMUTE:
|
|
@@ -19317,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
|
|
19317
18981
|
sched_yield();
|
|
19318
18982
|
}
|
|
19319
18983
|
|
|
19320
|
-
*
|
|
19321
|
-
if (*
|
|
18984
|
+
*node_n = atomic_load(&state->shared->node_n);
|
|
18985
|
+
if (*node_n != last_node_n) {
|
|
18986
|
+
break;
|
|
18987
|
+
}
|
|
18988
|
+
|
|
19322
18989
|
#if defined(__SSE3__)
|
|
19323
18990
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
|
19324
18991
|
_mm_pause();
|
|
@@ -19328,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
|
|
19328
18995
|
|
|
19329
18996
|
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
|
|
19330
18997
|
// wait for other threads to finish
|
|
19331
|
-
const int last_task_phase = *
|
|
18998
|
+
const int last_task_phase = *task_phase;
|
|
19332
18999
|
|
|
19333
19000
|
while (true) {
|
|
19334
19001
|
if (do_yield) {
|
|
19335
19002
|
sched_yield();
|
|
19336
19003
|
}
|
|
19337
19004
|
|
|
19338
|
-
*
|
|
19339
|
-
if (*
|
|
19005
|
+
*task_phase = atomic_load(&state->shared->node_task);
|
|
19006
|
+
if (*task_phase != last_task_phase) {
|
|
19007
|
+
break;
|
|
19008
|
+
}
|
|
19009
|
+
|
|
19340
19010
|
#if defined(__SSE3__)
|
|
19341
19011
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
|
19342
19012
|
_mm_pause();
|
|
@@ -19536,22 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
19536
19206
|
{
|
|
19537
19207
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
|
19538
19208
|
|
|
19539
|
-
#if defined(GGML_USE_CLBLAST)
|
|
19540
|
-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
19541
|
-
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
|
19542
|
-
} else
|
|
19543
|
-
#endif
|
|
19544
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
19545
|
-
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
|
19546
|
-
if (node->src[0]->type != GGML_TYPE_F32) {
|
|
19547
|
-
// here we need memory for fully dequantized matrix from src0
|
|
19548
|
-
// take into account that src0 can be broadcasted into src1[2,3]
|
|
19549
|
-
cur = ggml_type_size(GGML_TYPE_F32)
|
|
19550
|
-
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
|
19551
|
-
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
|
19552
|
-
}
|
|
19553
|
-
} else
|
|
19554
|
-
#endif
|
|
19555
19209
|
if (node->src[1]->type != vec_dot_type) {
|
|
19556
19210
|
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
|
19557
19211
|
}
|
|
@@ -19670,6 +19324,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
19670
19324
|
return cplan;
|
|
19671
19325
|
}
|
|
19672
19326
|
|
|
19327
|
+
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
|
|
19328
|
+
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
|
|
19329
|
+
|
|
19330
|
+
#ifdef GGML_USE_OPENMP
|
|
19331
|
+
if (n_threads > 1) {
|
|
19332
|
+
#pragma omp parallel num_threads(n_threads)
|
|
19333
|
+
{
|
|
19334
|
+
#pragma omp single
|
|
19335
|
+
{
|
|
19336
|
+
// update the number of threads from the actual number of threads that we got from OpenMP
|
|
19337
|
+
n_threads = omp_get_num_threads();
|
|
19338
|
+
workers[0].shared->n_threads = n_threads;
|
|
19339
|
+
workers[0].shared->n_active = n_threads;
|
|
19340
|
+
}
|
|
19341
|
+
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
|
19342
|
+
}
|
|
19343
|
+
} else {
|
|
19344
|
+
ggml_graph_compute_thread(&workers[0]);
|
|
19345
|
+
}
|
|
19346
|
+
#else
|
|
19347
|
+
// create thread pool
|
|
19348
|
+
if (n_threads > 1) {
|
|
19349
|
+
for (int j = 1; j < n_threads; ++j) {
|
|
19350
|
+
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
19351
|
+
GGML_ASSERT(rc == 0);
|
|
19352
|
+
UNUSED(rc);
|
|
19353
|
+
}
|
|
19354
|
+
}
|
|
19355
|
+
|
|
19356
|
+
// this is a work thread too
|
|
19357
|
+
ggml_graph_compute_thread(&workers[0]);
|
|
19358
|
+
|
|
19359
|
+
// join or kill thread pool
|
|
19360
|
+
if (n_threads > 1) {
|
|
19361
|
+
for (int j = 1; j < n_threads; j++) {
|
|
19362
|
+
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
19363
|
+
GGML_ASSERT(rc == 0);
|
|
19364
|
+
UNUSED(rc);
|
|
19365
|
+
}
|
|
19366
|
+
}
|
|
19367
|
+
#endif
|
|
19368
|
+
// don't leave affinity set on the main thread
|
|
19369
|
+
clear_numa_thread_affinity();
|
|
19370
|
+
|
|
19371
|
+
for (int j = 0; j < n_threads; j++) {
|
|
19372
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS) {
|
|
19373
|
+
compute_status = workers[j].ec;
|
|
19374
|
+
break;
|
|
19375
|
+
}
|
|
19376
|
+
}
|
|
19377
|
+
return compute_status;
|
|
19378
|
+
}
|
|
19379
|
+
|
|
19673
19380
|
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
19674
19381
|
{
|
|
19675
19382
|
GGML_ASSERT(cplan);
|
|
@@ -19680,7 +19387,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
|
19680
19387
|
}
|
|
19681
19388
|
}
|
|
19682
19389
|
|
|
19683
|
-
|
|
19390
|
+
int n_threads = cplan->n_threads;
|
|
19391
|
+
|
|
19392
|
+
#if defined(GGML_USE_OPENMP)
|
|
19393
|
+
n_threads = MIN(n_threads, omp_get_max_threads());
|
|
19394
|
+
#endif
|
|
19684
19395
|
|
|
19685
19396
|
struct ggml_compute_state_shared state_shared = {
|
|
19686
19397
|
/*.cgraph =*/ cgraph,
|
|
@@ -19696,47 +19407,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
|
19696
19407
|
/*.current_chunk; =*/ 0,
|
|
19697
19408
|
};
|
|
19698
19409
|
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
|
19699
|
-
|
|
19700
|
-
// create thread pool
|
|
19701
|
-
if (n_threads > 1) {
|
|
19702
|
-
for (int j = 1; j < n_threads; ++j) {
|
|
19703
|
-
workers[j] = (struct ggml_compute_state) {
|
|
19704
|
-
.thrd = 0,
|
|
19705
|
-
.ith = j,
|
|
19706
|
-
.shared = &state_shared,
|
|
19707
|
-
.ec = GGML_STATUS_SUCCESS,
|
|
19708
|
-
};
|
|
19709
|
-
|
|
19710
|
-
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
19711
|
-
GGML_ASSERT(rc == 0);
|
|
19712
|
-
UNUSED(rc);
|
|
19713
|
-
}
|
|
19714
|
-
}
|
|
19715
|
-
|
|
19716
|
-
workers[0].ith = 0;
|
|
19717
|
-
workers[0].shared = &state_shared;
|
|
19718
|
-
workers[0].ec = GGML_STATUS_SUCCESS;
|
|
19719
|
-
|
|
19720
19410
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
|
19721
19411
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
|
19722
19412
|
|
|
19723
|
-
|
|
19724
|
-
|
|
19725
|
-
|
|
19726
|
-
|
|
19727
|
-
|
|
19728
|
-
|
|
19729
|
-
|
|
19730
|
-
// join or kill thread pool
|
|
19731
|
-
if (n_threads > 1) {
|
|
19732
|
-
for (int j = 1; j < n_threads; j++) {
|
|
19733
|
-
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
19734
|
-
GGML_ASSERT(rc == 0);
|
|
19735
|
-
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
|
19736
|
-
compute_status = workers[j].ec;
|
|
19737
|
-
}
|
|
19413
|
+
for (int j = 0; j < n_threads; ++j) {
|
|
19414
|
+
workers[j] = (struct ggml_compute_state) {
|
|
19415
|
+
.thrd = 0,
|
|
19416
|
+
.ith = j,
|
|
19417
|
+
.shared = &state_shared,
|
|
19418
|
+
.ec = GGML_STATUS_SUCCESS,
|
|
19419
|
+
};
|
|
19738
19420
|
}
|
|
19739
19421
|
|
|
19422
|
+
enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
|
|
19423
|
+
|
|
19740
19424
|
// performance stats (graph)
|
|
19741
19425
|
{
|
|
19742
19426
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
|
@@ -22819,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
|
22819
22503
|
}
|
|
22820
22504
|
|
|
22821
22505
|
int ggml_cpu_has_blas(void) {
|
|
22822
|
-
#if defined(
|
|
22506
|
+
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
|
22823
22507
|
return 1;
|
|
22824
22508
|
#else
|
|
22825
22509
|
return 0;
|
|
@@ -22834,14 +22518,6 @@ int ggml_cpu_has_cuda(void) {
|
|
|
22834
22518
|
#endif
|
|
22835
22519
|
}
|
|
22836
22520
|
|
|
22837
|
-
int ggml_cpu_has_clblast(void) {
|
|
22838
|
-
#if defined(GGML_USE_CLBLAST)
|
|
22839
|
-
return 1;
|
|
22840
|
-
#else
|
|
22841
|
-
return 0;
|
|
22842
|
-
#endif
|
|
22843
|
-
}
|
|
22844
|
-
|
|
22845
22521
|
int ggml_cpu_has_vulkan(void) {
|
|
22846
22522
|
#if defined(GGML_USE_VULKAN)
|
|
22847
22523
|
return 1;
|
|
@@ -22875,8 +22551,7 @@ int ggml_cpu_has_rpc(void) {
|
|
|
22875
22551
|
}
|
|
22876
22552
|
|
|
22877
22553
|
int ggml_cpu_has_gpublas(void) {
|
|
22878
|
-
return ggml_cpu_has_cuda() ||
|
|
22879
|
-
ggml_cpu_has_sycl();
|
|
22554
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
|
22880
22555
|
}
|
|
22881
22556
|
|
|
22882
22557
|
int ggml_cpu_has_sse3(void) {
|