llama_cpp 0.15.4 → 0.16.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +3 -2
- data/ext/llama_cpp/llama_cpp.cpp +17 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +166 -82
- data/vendor/tmp/llama.cpp/ggml-alloc.c +82 -26
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +183 -69
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +674 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +88 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +419 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +112 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +206 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +286 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +103 -135
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +29 -13
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +45 -33
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +15 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +26 -90
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +74522 -14913
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +631 -471
- data/vendor/tmp/llama.cpp/ggml.c +278 -603
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +345 -473
- data/vendor/tmp/llama.cpp/llama.h +21 -43
- metadata +134 -7
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include "ggml-quants.h"
|
6
6
|
#include "ggml.h"
|
7
7
|
|
8
|
+
|
8
9
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
9
10
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
10
11
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
@@ -28,6 +29,10 @@
|
|
28
29
|
#include <syscall.h>
|
29
30
|
#endif
|
30
31
|
|
32
|
+
#ifdef GGML_USE_OPENMP
|
33
|
+
#include <omp.h>
|
34
|
+
#endif
|
35
|
+
|
31
36
|
#ifdef GGML_USE_METAL
|
32
37
|
#include <unistd.h>
|
33
38
|
#endif
|
@@ -292,17 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
292
297
|
|
293
298
|
#if defined(GGML_USE_ACCELERATE)
|
294
299
|
#include <Accelerate/Accelerate.h>
|
295
|
-
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
296
|
-
#include "ggml-opencl.h"
|
297
|
-
#endif
|
298
|
-
#elif defined(GGML_USE_OPENBLAS)
|
299
|
-
#if defined(GGML_BLAS_USE_MKL)
|
300
|
-
#include <mkl.h>
|
301
|
-
#else
|
302
|
-
#include <cblas.h>
|
303
|
-
#endif
|
304
|
-
#elif defined(GGML_USE_CLBLAST)
|
305
|
-
#include "ggml-opencl.h"
|
306
300
|
#endif
|
307
301
|
|
308
302
|
// floating point type used to accumulate sums
|
@@ -1756,7 +1750,7 @@ struct ggml_compute_state_shared {
|
|
1756
1750
|
int64_t perf_node_start_cycles;
|
1757
1751
|
int64_t perf_node_start_time_us;
|
1758
1752
|
|
1759
|
-
|
1753
|
+
int n_threads;
|
1760
1754
|
|
1761
1755
|
// synchronization primitives
|
1762
1756
|
atomic_int n_active; // num active threads
|
@@ -2267,6 +2261,11 @@ inline static float ggml_silu_f32(float x) {
|
|
2267
2261
|
return x/(1.0f + expf(-x));
|
2268
2262
|
}
|
2269
2263
|
|
2264
|
+
#if __FINITE_MATH_ONLY__
|
2265
|
+
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
2266
|
+
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
2267
|
+
#endif
|
2268
|
+
|
2270
2269
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
2271
2270
|
|
2272
2271
|
// adapted from arm limited optimized routine
|
@@ -3207,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
3207
3206
|
return tensor->nb[0] > tensor->nb[1];
|
3208
3207
|
}
|
3209
3208
|
|
3210
|
-
|
3211
|
-
|
3209
|
+
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
3210
|
+
size_t next_nb = ggml_type_size(tensor->type);
|
3211
|
+
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
|
3212
|
+
return false;
|
3213
|
+
}
|
3214
|
+
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
|
3215
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
3216
|
+
if (tensor->ne[i] != 1) {
|
3217
|
+
if (i > n) {
|
3218
|
+
if (tensor->nb[i] != next_nb) {
|
3219
|
+
return false;
|
3220
|
+
}
|
3221
|
+
next_nb *= tensor->ne[i];
|
3222
|
+
} else {
|
3223
|
+
// this dimension does not need to be contiguous
|
3224
|
+
next_nb = tensor->ne[i]*tensor->nb[i];
|
3225
|
+
}
|
3226
|
+
}
|
3227
|
+
}
|
3228
|
+
return true;
|
3229
|
+
}
|
3212
3230
|
|
3213
|
-
|
3214
|
-
|
3215
|
-
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
3216
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
3217
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3231
|
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
3232
|
+
return ggml_is_contiguous_0(tensor);
|
3218
3233
|
}
|
3219
3234
|
|
3220
3235
|
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
3221
|
-
return
|
3236
|
+
return ggml_is_contiguous_n(tensor, 0);
|
3222
3237
|
}
|
3223
3238
|
|
3224
3239
|
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
3225
|
-
|
3226
|
-
|
3227
|
-
return
|
3228
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
3229
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
3230
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3240
|
+
return ggml_is_contiguous_n(tensor, 1);
|
3231
3241
|
}
|
3232
3242
|
|
3233
3243
|
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
3234
|
-
|
3235
|
-
|
3236
|
-
return
|
3237
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
3238
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3244
|
+
return ggml_is_contiguous_n(tensor, 2);
|
3239
3245
|
}
|
3240
3246
|
|
3241
3247
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
@@ -3267,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
3267
3273
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3268
3274
|
|
3269
3275
|
return
|
3270
|
-
(t0->ne[0] == t1->ne[0]
|
3271
|
-
(t0->ne[1] == t1->ne[1]
|
3272
|
-
(t0->ne[2] == t1->ne[2]
|
3273
|
-
(t0->ne[3] == t1->ne[3]
|
3276
|
+
(t0->ne[0] == t1->ne[0]) &&
|
3277
|
+
(t0->ne[1] == t1->ne[1]) &&
|
3278
|
+
(t0->ne[2] == t1->ne[2]) &&
|
3279
|
+
(t0->ne[3] == t1->ne[3]);
|
3274
3280
|
}
|
3275
3281
|
|
3276
3282
|
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
3277
3283
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3278
3284
|
|
3279
3285
|
return
|
3280
|
-
(t0->nb[0] == t1->nb[0]
|
3281
|
-
(t0->nb[1] == t1->nb[1]
|
3282
|
-
(t0->nb[2] == t1->nb[2]
|
3283
|
-
(t0->nb[3] == t1->nb[3]
|
3286
|
+
(t0->nb[0] == t1->nb[0]) &&
|
3287
|
+
(t0->nb[1] == t1->nb[1]) &&
|
3288
|
+
(t0->nb[2] == t1->nb[2]) &&
|
3289
|
+
(t0->nb[3] == t1->nb[3]);
|
3284
3290
|
}
|
3285
3291
|
|
3286
3292
|
// check if t1 can be represented as a repeatition of t0
|
@@ -3370,10 +3376,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
3370
3376
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
3371
3377
|
}
|
3372
3378
|
|
3373
|
-
#if defined(GGML_USE_CLBLAST)
|
3374
|
-
ggml_cl_init();
|
3375
|
-
#endif
|
3376
|
-
|
3377
3379
|
ggml_setup_op_has_task_pass();
|
3378
3380
|
|
3379
3381
|
is_first_call = false;
|
@@ -4077,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
4077
4079
|
switch (tensor->type) {
|
4078
4080
|
case GGML_TYPE_I8:
|
4079
4081
|
{
|
4080
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
4081
4082
|
return ((int8_t *)(tensor->data))[i];
|
4082
4083
|
}
|
4083
4084
|
case GGML_TYPE_I16:
|
4084
4085
|
{
|
4085
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
4086
4086
|
return ((int16_t *)(tensor->data))[i];
|
4087
4087
|
}
|
4088
4088
|
case GGML_TYPE_I32:
|
4089
4089
|
{
|
4090
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
4091
4090
|
return ((int32_t *)(tensor->data))[i];
|
4092
4091
|
}
|
4093
4092
|
case GGML_TYPE_F16:
|
4094
4093
|
{
|
4095
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
4096
4094
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
4097
4095
|
}
|
4098
4096
|
case GGML_TYPE_BF16:
|
4099
4097
|
{
|
4100
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
4101
4098
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
|
4102
4099
|
}
|
4103
4100
|
case GGML_TYPE_F32:
|
4104
4101
|
{
|
4105
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
4106
4102
|
return ((float *)(tensor->data))[i];
|
4107
4103
|
}
|
4108
4104
|
default:
|
@@ -4124,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
4124
4120
|
switch (tensor->type) {
|
4125
4121
|
case GGML_TYPE_I8:
|
4126
4122
|
{
|
4127
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
4128
4123
|
((int8_t *)(tensor->data))[i] = value;
|
4129
4124
|
} break;
|
4130
4125
|
case GGML_TYPE_I16:
|
4131
4126
|
{
|
4132
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
4133
4127
|
((int16_t *)(tensor->data))[i] = value;
|
4134
4128
|
} break;
|
4135
4129
|
case GGML_TYPE_I32:
|
4136
4130
|
{
|
4137
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
4138
4131
|
((int32_t *)(tensor->data))[i] = value;
|
4139
4132
|
} break;
|
4140
4133
|
case GGML_TYPE_F16:
|
4141
4134
|
{
|
4142
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
4143
4135
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
4144
4136
|
} break;
|
4145
4137
|
case GGML_TYPE_BF16:
|
4146
4138
|
{
|
4147
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
4148
4139
|
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
|
4149
4140
|
} break;
|
4150
4141
|
case GGML_TYPE_F32:
|
4151
4142
|
{
|
4152
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
4153
4143
|
((float *)(tensor->data))[i] = value;
|
4154
4144
|
} break;
|
4155
4145
|
default:
|
@@ -6249,16 +6239,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
6249
6239
|
struct ggml_tensor * c,
|
6250
6240
|
int n_dims,
|
6251
6241
|
int mode,
|
6252
|
-
int
|
6253
|
-
int n_orig_ctx,
|
6242
|
+
int n_ctx_orig,
|
6254
6243
|
float freq_base,
|
6255
6244
|
float freq_scale,
|
6256
6245
|
float ext_factor,
|
6257
6246
|
float attn_factor,
|
6258
6247
|
float beta_fast,
|
6259
6248
|
float beta_slow,
|
6260
|
-
float xpos_base,
|
6261
|
-
bool xpos_down,
|
6262
6249
|
bool inplace) {
|
6263
6250
|
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
|
6264
6251
|
|
@@ -6279,15 +6266,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
6279
6266
|
|
6280
6267
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6281
6268
|
|
6282
|
-
int32_t params[
|
6269
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
6283
6270
|
memcpy(params + 5, &freq_base, sizeof(float));
|
6284
6271
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
6285
6272
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
6286
6273
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
6287
6274
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
6288
6275
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
6289
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
6290
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
6291
6276
|
ggml_set_op_params(result, params, sizeof(params));
|
6292
6277
|
|
6293
6278
|
result->op = GGML_OP_ROPE;
|
@@ -6304,10 +6289,9 @@ struct ggml_tensor * ggml_rope(
|
|
6304
6289
|
struct ggml_tensor * a,
|
6305
6290
|
struct ggml_tensor * b,
|
6306
6291
|
int n_dims,
|
6307
|
-
int mode
|
6308
|
-
int n_ctx) {
|
6292
|
+
int mode) {
|
6309
6293
|
return ggml_rope_impl(
|
6310
|
-
ctx, a, b, NULL, n_dims, mode,
|
6294
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
6311
6295
|
);
|
6312
6296
|
}
|
6313
6297
|
|
@@ -6316,10 +6300,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
6316
6300
|
struct ggml_tensor * a,
|
6317
6301
|
struct ggml_tensor * b,
|
6318
6302
|
int n_dims,
|
6319
|
-
int mode
|
6320
|
-
int n_ctx) {
|
6303
|
+
int mode) {
|
6321
6304
|
return ggml_rope_impl(
|
6322
|
-
ctx, a, b, NULL, n_dims, mode,
|
6305
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
6323
6306
|
);
|
6324
6307
|
}
|
6325
6308
|
|
@@ -6330,8 +6313,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
6330
6313
|
struct ggml_tensor * c,
|
6331
6314
|
int n_dims,
|
6332
6315
|
int mode,
|
6333
|
-
int
|
6334
|
-
int n_orig_ctx,
|
6316
|
+
int n_ctx_orig,
|
6335
6317
|
float freq_base,
|
6336
6318
|
float freq_scale,
|
6337
6319
|
float ext_factor,
|
@@ -6339,8 +6321,8 @@ struct ggml_tensor * ggml_rope_ext(
|
|
6339
6321
|
float beta_fast,
|
6340
6322
|
float beta_slow) {
|
6341
6323
|
return ggml_rope_impl(
|
6342
|
-
ctx, a, b, c, n_dims, mode,
|
6343
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6324
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6325
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
6344
6326
|
);
|
6345
6327
|
}
|
6346
6328
|
|
@@ -6351,8 +6333,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
6351
6333
|
struct ggml_tensor * c,
|
6352
6334
|
int n_dims,
|
6353
6335
|
int mode,
|
6354
|
-
int
|
6355
|
-
int n_orig_ctx,
|
6336
|
+
int n_ctx_orig,
|
6356
6337
|
float freq_base,
|
6357
6338
|
float freq_scale,
|
6358
6339
|
float ext_factor,
|
@@ -6360,8 +6341,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
6360
6341
|
float beta_fast,
|
6361
6342
|
float beta_slow) {
|
6362
6343
|
return ggml_rope_impl(
|
6363
|
-
ctx, a, b, c, n_dims, mode,
|
6364
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6344
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6345
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
6365
6346
|
);
|
6366
6347
|
}
|
6367
6348
|
|
@@ -6371,8 +6352,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
6371
6352
|
struct ggml_tensor * b,
|
6372
6353
|
int n_dims,
|
6373
6354
|
int mode,
|
6374
|
-
int
|
6375
|
-
int n_orig_ctx,
|
6355
|
+
int n_ctx_orig,
|
6376
6356
|
float freq_base,
|
6377
6357
|
float freq_scale,
|
6378
6358
|
float ext_factor,
|
@@ -6380,8 +6360,8 @@ struct ggml_tensor * ggml_rope_custom(
|
|
6380
6360
|
float beta_fast,
|
6381
6361
|
float beta_slow) {
|
6382
6362
|
return ggml_rope_impl(
|
6383
|
-
ctx, a, b, NULL, n_dims, mode,
|
6384
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6363
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6364
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
6385
6365
|
);
|
6386
6366
|
}
|
6387
6367
|
|
@@ -6391,8 +6371,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
6391
6371
|
struct ggml_tensor * b,
|
6392
6372
|
int n_dims,
|
6393
6373
|
int mode,
|
6394
|
-
int
|
6395
|
-
int n_orig_ctx,
|
6374
|
+
int n_ctx_orig,
|
6396
6375
|
float freq_base,
|
6397
6376
|
float freq_scale,
|
6398
6377
|
float ext_factor,
|
@@ -6400,21 +6379,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
6400
6379
|
float beta_fast,
|
6401
6380
|
float beta_slow) {
|
6402
6381
|
return ggml_rope_impl(
|
6403
|
-
ctx, a, b, NULL, n_dims, mode,
|
6404
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6382
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6383
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
6405
6384
|
);
|
6406
6385
|
}
|
6407
6386
|
|
6408
|
-
struct ggml_tensor * ggml_rope_xpos_inplace(
|
6409
|
-
struct ggml_context * ctx,
|
6410
|
-
struct ggml_tensor * a,
|
6411
|
-
struct ggml_tensor * b,
|
6412
|
-
int n_dims,
|
6413
|
-
float base,
|
6414
|
-
bool down) {
|
6415
|
-
return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
|
6416
|
-
}
|
6417
|
-
|
6418
6387
|
// ggml_rope_back
|
6419
6388
|
|
6420
6389
|
struct ggml_tensor * ggml_rope_back(
|
@@ -6424,16 +6393,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
6424
6393
|
struct ggml_tensor * c,
|
6425
6394
|
int n_dims,
|
6426
6395
|
int mode,
|
6427
|
-
int
|
6428
|
-
int n_orig_ctx,
|
6396
|
+
int n_ctx_orig,
|
6429
6397
|
float freq_base,
|
6430
6398
|
float freq_scale,
|
6431
6399
|
float ext_factor,
|
6432
6400
|
float attn_factor,
|
6433
6401
|
float beta_fast,
|
6434
|
-
float beta_slow
|
6435
|
-
float xpos_base,
|
6436
|
-
bool xpos_down) {
|
6402
|
+
float beta_slow) {
|
6437
6403
|
GGML_ASSERT(ggml_is_vector(b));
|
6438
6404
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
6439
6405
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
@@ -6449,15 +6415,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
6449
6415
|
|
6450
6416
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
6451
6417
|
|
6452
|
-
int32_t params[
|
6418
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
6453
6419
|
memcpy(params + 5, &freq_base, sizeof(float));
|
6454
6420
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
6455
6421
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
6456
6422
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
6457
6423
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
6458
6424
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
6459
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
6460
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
6461
6425
|
ggml_set_op_params(result, params, sizeof(params));
|
6462
6426
|
|
6463
6427
|
result->op = GGML_OP_ROPE_BACK;
|
@@ -7368,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
|
|
7368
7332
|
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
|
7369
7333
|
}
|
7370
7334
|
|
7371
|
-
//
|
7335
|
+
// ggml_unary
|
7372
7336
|
|
7373
7337
|
static struct ggml_tensor * ggml_unary_impl(
|
7374
7338
|
struct ggml_context * ctx,
|
7375
7339
|
struct ggml_tensor * a,
|
7376
7340
|
enum ggml_unary_op op,
|
7377
7341
|
bool inplace) {
|
7342
|
+
GGML_ASSERT(ggml_is_contiguous_1(a));
|
7343
|
+
|
7378
7344
|
bool is_node = false;
|
7379
7345
|
|
7380
7346
|
if (!inplace && (a->grad)) {
|
@@ -9043,17 +9009,6 @@ static void ggml_compute_forward_add_f32(
|
|
9043
9009
|
const int ith = params->ith;
|
9044
9010
|
const int nth = params->nth;
|
9045
9011
|
|
9046
|
-
#ifdef GGML_USE_CLBLAST
|
9047
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
9048
|
-
// TODO: OpenCL kernel support full broadcast
|
9049
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
9050
|
-
if (ith == 0) {
|
9051
|
-
ggml_cl_add(src0, src1, dst);
|
9052
|
-
}
|
9053
|
-
return;
|
9054
|
-
}
|
9055
|
-
#endif
|
9056
|
-
|
9057
9012
|
const int nr = ggml_nrows(src0);
|
9058
9013
|
|
9059
9014
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -10161,17 +10116,6 @@ static void ggml_compute_forward_mul_f32(
|
|
10161
10116
|
const int ith = params->ith;
|
10162
10117
|
const int nth = params->nth;
|
10163
10118
|
|
10164
|
-
#if defined(GGML_USE_CLBLAST)
|
10165
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
10166
|
-
// TODO: OpenCL kernel support full broadcast
|
10167
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
10168
|
-
if (ith == 0) {
|
10169
|
-
ggml_cl_mul(src0, src1, dst);
|
10170
|
-
}
|
10171
|
-
return;
|
10172
|
-
}
|
10173
|
-
#endif
|
10174
|
-
|
10175
10119
|
const int64_t nr = ggml_nrows(src0);
|
10176
10120
|
|
10177
10121
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -11061,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
|
|
11061
11005
|
const struct ggml_tensor * src0 = dst->src[0];
|
11062
11006
|
|
11063
11007
|
assert(params->ith == 0);
|
11008
|
+
assert(ggml_is_contiguous_1(src0));
|
11009
|
+
assert(ggml_is_contiguous_1(dst));
|
11064
11010
|
assert(ggml_are_same_shape(src0, dst));
|
11065
11011
|
|
11066
11012
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11070,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
|
|
11070
11016
|
const int n = ggml_nrows(src0);
|
11071
11017
|
const int nc = src0->ne[0];
|
11072
11018
|
|
11073
|
-
assert(dst->nb[0] == sizeof(float));
|
11074
|
-
assert(src0->nb[0] == sizeof(float));
|
11075
|
-
|
11076
11019
|
for (int i = 0; i < n; i++) {
|
11077
11020
|
ggml_vec_abs_f32(nc,
|
11078
11021
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11107,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
|
|
11107
11050
|
const struct ggml_tensor * src0 = dst->src[0];
|
11108
11051
|
|
11109
11052
|
assert(params->ith == 0);
|
11053
|
+
assert(ggml_is_contiguous_1(src0));
|
11054
|
+
assert(ggml_is_contiguous_1(dst));
|
11110
11055
|
assert(ggml_are_same_shape(src0, dst));
|
11111
11056
|
|
11112
11057
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11116,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
|
|
11116
11061
|
const int n = ggml_nrows(src0);
|
11117
11062
|
const int nc = src0->ne[0];
|
11118
11063
|
|
11119
|
-
assert(dst->nb[0] == sizeof(float));
|
11120
|
-
assert(src0->nb[0] == sizeof(float));
|
11121
|
-
|
11122
11064
|
for (int i = 0; i < n; i++) {
|
11123
11065
|
ggml_vec_sgn_f32(nc,
|
11124
11066
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11153,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
|
|
11153
11095
|
const struct ggml_tensor * src0 = dst->src[0];
|
11154
11096
|
|
11155
11097
|
assert(params->ith == 0);
|
11098
|
+
assert(ggml_is_contiguous_1(src0));
|
11099
|
+
assert(ggml_is_contiguous_1(dst));
|
11156
11100
|
assert(ggml_are_same_shape(src0, dst));
|
11157
11101
|
|
11158
11102
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11162,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
|
|
11162
11106
|
const int n = ggml_nrows(src0);
|
11163
11107
|
const int nc = src0->ne[0];
|
11164
11108
|
|
11165
|
-
assert(dst->nb[0] == sizeof(float));
|
11166
|
-
assert(src0->nb[0] == sizeof(float));
|
11167
|
-
|
11168
11109
|
for (int i = 0; i < n; i++) {
|
11169
11110
|
ggml_vec_neg_f32(nc,
|
11170
11111
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11199,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
|
|
11199
11140
|
const struct ggml_tensor * src0 = dst->src[0];
|
11200
11141
|
|
11201
11142
|
assert(params->ith == 0);
|
11143
|
+
assert(ggml_is_contiguous_1(src0));
|
11144
|
+
assert(ggml_is_contiguous_1(dst));
|
11202
11145
|
assert(ggml_are_same_shape(src0, dst));
|
11203
11146
|
|
11204
11147
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11208,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
|
|
11208
11151
|
const int n = ggml_nrows(src0);
|
11209
11152
|
const int nc = src0->ne[0];
|
11210
11153
|
|
11211
|
-
assert(dst->nb[0] == sizeof(float));
|
11212
|
-
assert(src0->nb[0] == sizeof(float));
|
11213
|
-
|
11214
11154
|
for (int i = 0; i < n; i++) {
|
11215
11155
|
ggml_vec_step_f32(nc,
|
11216
11156
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11245,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
|
|
11245
11185
|
const struct ggml_tensor * src0 = dst->src[0];
|
11246
11186
|
|
11247
11187
|
assert(params->ith == 0);
|
11188
|
+
assert(ggml_is_contiguous_1(src0));
|
11189
|
+
assert(ggml_is_contiguous_1(dst));
|
11248
11190
|
assert(ggml_are_same_shape(src0, dst));
|
11249
11191
|
|
11250
11192
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11254,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
|
|
11254
11196
|
const int n = ggml_nrows(src0);
|
11255
11197
|
const int nc = src0->ne[0];
|
11256
11198
|
|
11257
|
-
assert(dst->nb[0] == sizeof(float));
|
11258
|
-
assert(src0->nb[0] == sizeof(float));
|
11259
|
-
|
11260
11199
|
for (int i = 0; i < n; i++) {
|
11261
11200
|
ggml_vec_tanh_f32(nc,
|
11262
11201
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11291,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
|
|
11291
11230
|
const struct ggml_tensor * src0 = dst->src[0];
|
11292
11231
|
|
11293
11232
|
assert(params->ith == 0);
|
11233
|
+
assert(ggml_is_contiguous_1(src0));
|
11234
|
+
assert(ggml_is_contiguous_1(dst));
|
11294
11235
|
assert(ggml_are_same_shape(src0, dst));
|
11295
11236
|
|
11296
11237
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11300,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
|
|
11300
11241
|
const int n = ggml_nrows(src0);
|
11301
11242
|
const int nc = src0->ne[0];
|
11302
11243
|
|
11303
|
-
assert(dst->nb[0] == sizeof(float));
|
11304
|
-
assert(src0->nb[0] == sizeof(float));
|
11305
|
-
|
11306
11244
|
for (int i = 0; i < n; i++) {
|
11307
11245
|
ggml_vec_elu_f32(nc,
|
11308
11246
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11337,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
|
|
11337
11275
|
const struct ggml_tensor * src0 = dst->src[0];
|
11338
11276
|
|
11339
11277
|
assert(params->ith == 0);
|
11278
|
+
assert(ggml_is_contiguous_1(src0));
|
11279
|
+
assert(ggml_is_contiguous_1(dst));
|
11340
11280
|
assert(ggml_are_same_shape(src0, dst));
|
11341
11281
|
|
11342
11282
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11346,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
|
|
11346
11286
|
const int n = ggml_nrows(src0);
|
11347
11287
|
const int nc = src0->ne[0];
|
11348
11288
|
|
11349
|
-
assert(dst->nb[0] == sizeof(float));
|
11350
|
-
assert(src0->nb[0] == sizeof(float));
|
11351
|
-
|
11352
11289
|
for (int i = 0; i < n; i++) {
|
11353
11290
|
ggml_vec_relu_f32(nc,
|
11354
11291
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11383,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
11383
11320
|
const struct ggml_tensor * src0 = dst->src[0];
|
11384
11321
|
|
11385
11322
|
assert(params->ith == 0);
|
11323
|
+
assert(ggml_is_contiguous_1(src0));
|
11324
|
+
assert(ggml_is_contiguous_1(dst));
|
11386
11325
|
assert(ggml_are_same_shape(src0, dst));
|
11387
11326
|
|
11388
11327
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11392,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
11392
11331
|
const int n = ggml_nrows(src0);
|
11393
11332
|
const int nc = src0->ne[0];
|
11394
11333
|
|
11395
|
-
assert(dst->nb[0] == sizeof(float));
|
11396
|
-
assert(src0->nb[0] == sizeof(float));
|
11397
|
-
|
11398
11334
|
for (int i = 0; i < n; i++) {
|
11399
11335
|
ggml_vec_sigmoid_f32(nc,
|
11400
11336
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11428,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
|
|
11428
11364
|
|
11429
11365
|
const struct ggml_tensor * src0 = dst->src[0];
|
11430
11366
|
|
11431
|
-
|
11432
|
-
|
11433
|
-
|
11367
|
+
assert(ggml_is_contiguous_1(src0));
|
11368
|
+
assert(ggml_is_contiguous_1(dst));
|
11369
|
+
assert(ggml_are_same_shape(src0, dst));
|
11434
11370
|
|
11435
11371
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11436
11372
|
return;
|
@@ -11491,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
11491
11427
|
|
11492
11428
|
const struct ggml_tensor * src0 = dst->src[0];
|
11493
11429
|
|
11494
|
-
|
11495
|
-
|
11496
|
-
|
11430
|
+
assert(ggml_is_contiguous_1(src0));
|
11431
|
+
assert(ggml_is_contiguous_1(dst));
|
11432
|
+
assert(ggml_are_same_shape(src0, dst));
|
11497
11433
|
|
11498
11434
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11499
11435
|
return;
|
@@ -11554,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
|
|
11554
11490
|
|
11555
11491
|
const struct ggml_tensor * src0 = dst->src[0];
|
11556
11492
|
|
11557
|
-
|
11558
|
-
|
11559
|
-
|
11493
|
+
assert(ggml_is_contiguous_1(src0));
|
11494
|
+
assert(ggml_is_contiguous_1(dst));
|
11495
|
+
assert(ggml_are_same_shape(src0, dst));
|
11560
11496
|
|
11561
11497
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11562
11498
|
return;
|
@@ -11617,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
|
|
11617
11553
|
const struct ggml_tensor * src0 = dst->src[0];
|
11618
11554
|
|
11619
11555
|
assert(params->ith == 0);
|
11556
|
+
assert(ggml_is_contiguous_1(src0));
|
11557
|
+
assert(ggml_is_contiguous_1(dst));
|
11620
11558
|
assert(ggml_are_same_shape(src0, dst));
|
11621
11559
|
|
11622
11560
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11666,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
|
|
11666
11604
|
const struct ggml_tensor * src0 = dst->src[0];
|
11667
11605
|
const struct ggml_tensor * grad = dst->src[1];
|
11668
11606
|
|
11669
|
-
|
11670
|
-
|
11671
|
-
|
11672
|
-
|
11673
|
-
|
11607
|
+
assert(ggml_is_contiguous_1(grad));
|
11608
|
+
assert(ggml_is_contiguous_1(src0));
|
11609
|
+
assert(ggml_is_contiguous_1(dst));
|
11610
|
+
assert(ggml_are_same_shape(src0, dst));
|
11611
|
+
assert(ggml_are_same_shape(src0, grad));
|
11674
11612
|
|
11675
11613
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11676
11614
|
return;
|
@@ -11732,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
|
|
11732
11670
|
const struct ggml_tensor * src0 = dst->src[0];
|
11733
11671
|
|
11734
11672
|
assert(params->ith == 0);
|
11673
|
+
assert(ggml_is_contiguous_1(src0));
|
11674
|
+
assert(ggml_is_contiguous_1(dst));
|
11735
11675
|
assert(ggml_are_same_shape(src0, dst));
|
11736
11676
|
|
11737
11677
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11741,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
|
|
11741
11681
|
const int n = ggml_nrows(src0);
|
11742
11682
|
const int nc = src0->ne[0];
|
11743
11683
|
|
11744
|
-
assert(dst->nb[0] == sizeof(float));
|
11745
|
-
assert(src0->nb[0] == sizeof(float));
|
11746
|
-
|
11747
11684
|
for (int i = 0; i < n; i++) {
|
11748
11685
|
ggml_vec_hardswish_f32(nc,
|
11749
11686
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11775,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
11775
11712
|
const struct ggml_tensor * src0 = dst->src[0];
|
11776
11713
|
|
11777
11714
|
assert(params->ith == 0);
|
11715
|
+
assert(ggml_is_contiguous_1(src0));
|
11716
|
+
assert(ggml_is_contiguous_1(dst));
|
11778
11717
|
assert(ggml_are_same_shape(src0, dst));
|
11779
11718
|
|
11780
11719
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11784,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
11784
11723
|
const int n = ggml_nrows(src0);
|
11785
11724
|
const int nc = src0->ne[0];
|
11786
11725
|
|
11787
|
-
assert(dst->nb[0] == sizeof(float));
|
11788
|
-
assert(src0->nb[0] == sizeof(float));
|
11789
|
-
|
11790
11726
|
for (int i = 0; i < n; i++) {
|
11791
11727
|
ggml_vec_hardsigmoid_f32(nc,
|
11792
11728
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -12237,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
|
|
12237
12173
|
|
12238
12174
|
// ggml_compute_forward_mul_mat
|
12239
12175
|
|
12240
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12241
|
-
// helper function to determine if it is better to use BLAS or not
|
12242
|
-
// for large matrices, BLAS is faster
|
12243
|
-
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
12244
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
12245
|
-
const struct ggml_tensor * src1 = dst->src[1];
|
12246
|
-
|
12247
|
-
//const int64_t ne00 = src0->ne[0];
|
12248
|
-
//const int64_t ne01 = src0->ne[1];
|
12249
|
-
|
12250
|
-
const int64_t ne10 = src1->ne[0];
|
12251
|
-
|
12252
|
-
const int64_t ne0 = dst->ne[0];
|
12253
|
-
const int64_t ne1 = dst->ne[1];
|
12254
|
-
|
12255
|
-
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
12256
|
-
// all the experts for each batch element and the processing would become incredibly slow
|
12257
|
-
// TODO: find the optimal values for these
|
12258
|
-
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
12259
|
-
ggml_is_contiguous(src0) &&
|
12260
|
-
ggml_is_contiguous(src1) &&
|
12261
|
-
//src0->type == GGML_TYPE_F32 &&
|
12262
|
-
src1->type == GGML_TYPE_F32 &&
|
12263
|
-
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
12264
|
-
|
12265
|
-
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
12266
|
-
return true;
|
12267
|
-
}
|
12268
|
-
|
12269
|
-
return false;
|
12270
|
-
}
|
12271
|
-
#endif
|
12272
|
-
|
12273
12176
|
static void ggml_compute_forward_mul_mat_one_chunk(
|
12274
12177
|
const struct ggml_compute_params * params,
|
12275
12178
|
struct ggml_tensor * dst,
|
@@ -12407,82 +12310,6 @@ static void ggml_compute_forward_mul_mat(
|
|
12407
12310
|
// nb01 >= nb00 - src0 is not transposed
|
12408
12311
|
// compute by src0 rows
|
12409
12312
|
|
12410
|
-
#if defined(GGML_USE_CLBLAST)
|
12411
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
12412
|
-
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
12413
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
12414
|
-
}
|
12415
|
-
return;
|
12416
|
-
}
|
12417
|
-
#endif
|
12418
|
-
|
12419
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12420
|
-
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
12421
|
-
const int64_t ne_plane = ne01*ne00;
|
12422
|
-
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
12423
|
-
UNUSED(desired_wsize);
|
12424
|
-
|
12425
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
12426
|
-
if (type != GGML_TYPE_F32) {
|
12427
|
-
assert(params->wsize >= desired_wsize);
|
12428
|
-
// parallelize by src0 rows
|
12429
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
12430
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
12431
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
12432
|
-
const int64_t i03 = i13/r3;
|
12433
|
-
const int64_t i02 = i12/r2;
|
12434
|
-
|
12435
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
12436
|
-
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
12437
|
-
ggml_to_float_t const to_float = type_traits[type].to_float;
|
12438
|
-
|
12439
|
-
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
12440
|
-
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
12441
|
-
}
|
12442
|
-
}
|
12443
|
-
}
|
12444
|
-
}
|
12445
|
-
return;
|
12446
|
-
}
|
12447
|
-
|
12448
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
12449
|
-
return;
|
12450
|
-
}
|
12451
|
-
|
12452
|
-
// perform sgemm, parallelization controlled by blas lib
|
12453
|
-
if (ith != 0) {
|
12454
|
-
return;
|
12455
|
-
}
|
12456
|
-
|
12457
|
-
//const int64_t tgemm0 = ggml_perf_time_us();
|
12458
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
12459
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
12460
|
-
const int64_t i03 = i13/r3;
|
12461
|
-
const int64_t i02 = i12/r2;
|
12462
|
-
|
12463
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
12464
|
-
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
12465
|
-
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
12466
|
-
|
12467
|
-
if (type != GGML_TYPE_F32) {
|
12468
|
-
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
12469
|
-
}
|
12470
|
-
|
12471
|
-
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
12472
|
-
ne1, ne01, ne10,
|
12473
|
-
1.0f, y, ne10,
|
12474
|
-
x, ne00,
|
12475
|
-
0.0f, d, ne01);
|
12476
|
-
}
|
12477
|
-
}
|
12478
|
-
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
12479
|
-
|
12480
|
-
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
12481
|
-
|
12482
|
-
return;
|
12483
|
-
}
|
12484
|
-
#endif
|
12485
|
-
|
12486
12313
|
#if GGML_USE_LLAMAFILE
|
12487
12314
|
const bool src1_cont = ggml_is_contiguous(src1);
|
12488
12315
|
|
@@ -12863,21 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|
12863
12690
|
// nb01 >= nb00 - src0 is not transposed
|
12864
12691
|
// compute by src0 rows
|
12865
12692
|
|
12866
|
-
// TODO: #if defined(GGML_USE_CLBLAST)
|
12867
|
-
|
12868
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12869
|
-
bool use_blas = ggml_is_matrix(src0) &&
|
12870
|
-
ggml_is_matrix(src1) &&
|
12871
|
-
ggml_is_contiguous(src0) &&
|
12872
|
-
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
12873
|
-
#endif
|
12874
|
-
|
12875
12693
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
12876
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
12877
|
-
if (use_blas) {
|
12878
|
-
return;
|
12879
|
-
}
|
12880
|
-
#endif
|
12881
12694
|
if (ith != 0) {
|
12882
12695
|
return;
|
12883
12696
|
}
|
@@ -12889,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
12889
12702
|
return;
|
12890
12703
|
}
|
12891
12704
|
|
12892
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12893
|
-
if (use_blas) {
|
12894
|
-
if (params->ith != 0) { // All threads other than the first do no work.
|
12895
|
-
return;
|
12896
|
-
}
|
12897
|
-
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
12898
|
-
// src0: (k,n)
|
12899
|
-
// src1: (k,m)
|
12900
|
-
// dst: (m,n)
|
12901
|
-
//
|
12902
|
-
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
12903
|
-
// Also expressed as (major,minor)
|
12904
|
-
// a: (m,k): so src1 transposed
|
12905
|
-
// b: (k,n): so src0
|
12906
|
-
// c: (m,n)
|
12907
|
-
//
|
12908
|
-
// However, if ggml_is_transposed(src1) is true, then
|
12909
|
-
// src1->data already contains a transposed version, so sgemm mustn't
|
12910
|
-
// transpose it further.
|
12911
|
-
|
12912
|
-
int n = src0->ne[0];
|
12913
|
-
int k = src0->ne[1];
|
12914
|
-
int m = src1->ne[0];
|
12915
|
-
|
12916
|
-
int transposeA, lda;
|
12917
|
-
|
12918
|
-
if (!ggml_is_transposed(src1)) {
|
12919
|
-
transposeA = CblasTrans;
|
12920
|
-
lda = m;
|
12921
|
-
} else {
|
12922
|
-
transposeA = CblasNoTrans;
|
12923
|
-
lda = k;
|
12924
|
-
}
|
12925
|
-
|
12926
|
-
float * a = (float *) ((char *) src1->data);
|
12927
|
-
float * b = (float *) ((char *) src0->data);
|
12928
|
-
float * c = (float *) ((char *) dst->data);
|
12929
|
-
|
12930
|
-
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
12931
|
-
|
12932
|
-
return;
|
12933
|
-
}
|
12934
|
-
#endif
|
12935
|
-
|
12936
12705
|
// dst[:,:,:,:] = 0
|
12937
12706
|
// for i2,i3:
|
12938
12707
|
// for i1:
|
@@ -13062,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
13062
12831
|
// nb01 >= nb00 - src0 is not transposed
|
13063
12832
|
// compute by src0 rows
|
13064
12833
|
|
13065
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
13066
|
-
|
13067
12834
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
13068
12835
|
if (ith != 0) {
|
13069
12836
|
return;
|
@@ -13460,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
|
|
13460
13227
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13461
13228
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13462
13229
|
|
13230
|
+
assert(i01 >= 0 && i01 < ne01);
|
13231
|
+
|
13463
13232
|
dequantize_row_q(
|
13464
13233
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13465
13234
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
@@ -13503,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
|
|
13503
13272
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13504
13273
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13505
13274
|
|
13275
|
+
assert(i01 >= 0 && i01 < ne01);
|
13276
|
+
|
13506
13277
|
ggml_fp16_to_fp32_row(
|
13507
13278
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13508
13279
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
@@ -13546,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
|
|
13546
13317
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13547
13318
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13548
13319
|
|
13549
|
-
|
13320
|
+
assert(i01 >= 0 && i01 < ne01);
|
13321
|
+
|
13322
|
+
ggml_bf16_to_fp32_row(
|
13550
13323
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13551
13324
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
13552
13325
|
}
|
@@ -13589,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
|
|
13589
13362
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13590
13363
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13591
13364
|
|
13365
|
+
assert(i01 >= 0 && i01 < ne01);
|
13366
|
+
|
13592
13367
|
ggml_vec_cpy_f32(nc,
|
13593
13368
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
13594
13369
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
@@ -14259,8 +14034,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
|
14259
14034
|
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
14260
14035
|
static void rope_yarn(
|
14261
14036
|
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
14262
|
-
float * cos_theta, float * sin_theta
|
14263
|
-
) {
|
14037
|
+
float * cos_theta, float * sin_theta) {
|
14264
14038
|
// Get n-d rotational scaling corrected for extrapolation
|
14265
14039
|
float theta_interp = freq_scale * theta_extrap;
|
14266
14040
|
float theta = theta_interp;
|
@@ -14277,18 +14051,19 @@ static void rope_yarn(
|
|
14277
14051
|
|
14278
14052
|
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
14279
14053
|
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
14280
|
-
static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
14281
|
-
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
14054
|
+
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
14055
|
+
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
14282
14056
|
}
|
14283
14057
|
|
14284
14058
|
static void ggml_rope_cache_init(
|
14285
|
-
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
14286
|
-
float * cache, float sin_sign, float theta_scale
|
14287
|
-
|
14059
|
+
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
14060
|
+
float * cache, float sin_sign, float theta_scale) {
|
14061
|
+
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
14288
14062
|
float theta = theta_base;
|
14289
14063
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14064
|
+
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
|
14290
14065
|
rope_yarn(
|
14291
|
-
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
14066
|
+
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
14292
14067
|
);
|
14293
14068
|
cache[i0 + 1] *= sin_sign;
|
14294
14069
|
|
@@ -14297,11 +14072,11 @@ static void ggml_rope_cache_init(
|
|
14297
14072
|
}
|
14298
14073
|
|
14299
14074
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
14300
|
-
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
14075
|
+
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
14301
14076
|
) {
|
14302
14077
|
// start and end correction dims
|
14303
|
-
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
|
14304
|
-
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
|
14078
|
+
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
14079
|
+
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
14305
14080
|
dims[0] = MAX(0, start);
|
14306
14081
|
dims[1] = MIN(n_dims - 1, end);
|
14307
14082
|
}
|
@@ -14321,15 +14096,11 @@ static void ggml_compute_forward_rope_f32(
|
|
14321
14096
|
|
14322
14097
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
14323
14098
|
|
14324
|
-
// these two only relevant for xPos RoPE:
|
14325
|
-
float xpos_base;
|
14326
|
-
bool xpos_down;
|
14327
|
-
|
14328
14099
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14329
14100
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
14330
14101
|
const int mode = ((int32_t *) dst->op_params)[2];
|
14331
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14332
|
-
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
14102
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14103
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
14333
14104
|
|
14334
14105
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
14335
14106
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
@@ -14337,8 +14108,6 @@ static void ggml_compute_forward_rope_f32(
|
|
14337
14108
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
14338
14109
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
14339
14110
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
14340
|
-
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
|
14341
|
-
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
|
14342
14111
|
|
14343
14112
|
GGML_TENSOR_UNARY_OP_LOCALS
|
14344
14113
|
|
@@ -14368,20 +14137,15 @@ static void ggml_compute_forward_rope_f32(
|
|
14368
14137
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
14369
14138
|
|
14370
14139
|
float corr_dims[2];
|
14371
|
-
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
|
14140
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
14372
14141
|
|
14373
14142
|
const bool is_neox = mode & 2;
|
14374
|
-
const bool is_glm = mode & 4;
|
14375
14143
|
|
14376
14144
|
const float * freq_factors = NULL;
|
14377
|
-
if (is_neox) {
|
14378
|
-
|
14379
|
-
|
14380
|
-
|
14381
|
-
freq_factors = (const float *) src2->data;
|
14382
|
-
}
|
14383
|
-
} else {
|
14384
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14145
|
+
if (src2 != NULL) {
|
14146
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14147
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14148
|
+
freq_factors = (const float *) src2->data;
|
14385
14149
|
}
|
14386
14150
|
|
14387
14151
|
// backward process uses inverse rotation by cos and sin.
|
@@ -14396,94 +14160,50 @@ static void ggml_compute_forward_rope_f32(
|
|
14396
14160
|
const int64_t p = pos[i2];
|
14397
14161
|
|
14398
14162
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
14399
|
-
|
14400
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14401
|
-
}
|
14163
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14402
14164
|
|
14403
14165
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
14404
14166
|
if (ir++ < ir0) continue;
|
14405
14167
|
if (ir > ir1) break;
|
14406
14168
|
|
14407
|
-
|
14408
|
-
|
14409
|
-
if (is_glm) {
|
14410
|
-
theta_base = MIN(p, n_ctx - 2);
|
14411
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
14412
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
14413
|
-
const float cos_theta = cosf(theta_base);
|
14414
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
14415
|
-
const float cos_block_theta = cosf(block_theta);
|
14416
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
14417
|
-
|
14418
|
-
theta_base *= theta_scale;
|
14419
|
-
block_theta *= theta_scale;
|
14420
|
-
|
14421
|
-
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14422
|
-
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14423
|
-
|
14424
|
-
const float x0 = src[0];
|
14425
|
-
const float x1 = src[n_dims/2];
|
14426
|
-
const float x2 = src[n_dims];
|
14427
|
-
const float x3 = src[n_dims/2*3];
|
14428
|
-
|
14429
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14430
|
-
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
14431
|
-
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
14432
|
-
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
14433
|
-
}
|
14434
|
-
} else if (!is_neox) {
|
14435
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14169
|
+
if (!is_neox) {
|
14170
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14436
14171
|
const float cos_theta = cache[i0 + 0];
|
14437
14172
|
const float sin_theta = cache[i0 + 1];
|
14438
14173
|
|
14439
|
-
// zeta scaling for xPos only:
|
14440
|
-
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
14441
|
-
if (xpos_down) zeta = 1.0f / zeta;
|
14442
|
-
|
14443
14174
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14444
14175
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14445
14176
|
|
14446
14177
|
const float x0 = src[0];
|
14447
14178
|
const float x1 = src[1];
|
14448
14179
|
|
14449
|
-
dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta;
|
14450
|
-
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
|
14180
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14181
|
+
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
14451
14182
|
}
|
14452
14183
|
} else {
|
14453
|
-
|
14454
|
-
|
14455
|
-
if (ic < n_dims) {
|
14456
|
-
const int64_t i0 = ic/2;
|
14457
|
-
|
14458
|
-
const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
|
14459
|
-
|
14460
|
-
float cos_theta, sin_theta;
|
14461
|
-
rope_yarn(
|
14462
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
14463
|
-
&cos_theta, &sin_theta
|
14464
|
-
);
|
14184
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14185
|
+
const int64_t ic = i0/2;
|
14465
14186
|
|
14466
|
-
|
14467
|
-
|
14187
|
+
const float cos_theta = cache[i0 + 0];
|
14188
|
+
const float sin_theta = cache[i0 + 1];
|
14468
14189
|
|
14469
|
-
|
14470
|
-
|
14190
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
14191
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
14471
14192
|
|
14472
|
-
|
14473
|
-
|
14193
|
+
const float x0 = src[0];
|
14194
|
+
const float x1 = src[n_dims/2];
|
14474
14195
|
|
14475
|
-
|
14476
|
-
|
14477
|
-
|
14478
|
-
|
14196
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14197
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
14198
|
+
}
|
14199
|
+
}
|
14479
14200
|
|
14480
|
-
|
14481
|
-
|
14201
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
14202
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14203
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14482
14204
|
|
14483
|
-
|
14484
|
-
|
14485
|
-
}
|
14486
|
-
}
|
14205
|
+
dst_data[0] = src[0];
|
14206
|
+
dst_data[1] = src[1];
|
14487
14207
|
}
|
14488
14208
|
}
|
14489
14209
|
}
|
@@ -14509,8 +14229,8 @@ static void ggml_compute_forward_rope_f16(
|
|
14509
14229
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14510
14230
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
14511
14231
|
const int mode = ((int32_t *) dst->op_params)[2];
|
14512
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14513
|
-
const int
|
14232
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14233
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
14514
14234
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
14515
14235
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
14516
14236
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
@@ -14546,20 +14266,15 @@ static void ggml_compute_forward_rope_f16(
|
|
14546
14266
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
14547
14267
|
|
14548
14268
|
float corr_dims[2];
|
14549
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
14269
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
14550
14270
|
|
14551
14271
|
const bool is_neox = mode & 2;
|
14552
|
-
const bool is_glm = mode & 4;
|
14553
14272
|
|
14554
14273
|
const float * freq_factors = NULL;
|
14555
|
-
if (
|
14556
|
-
|
14557
|
-
|
14558
|
-
|
14559
|
-
freq_factors = (const float *) src2->data;
|
14560
|
-
}
|
14561
|
-
} else {
|
14562
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14274
|
+
if (src2 != NULL) {
|
14275
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14276
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14277
|
+
freq_factors = (const float *) src2->data;
|
14563
14278
|
}
|
14564
14279
|
|
14565
14280
|
// backward process uses inverse rotation by cos and sin.
|
@@ -14574,43 +14289,14 @@ static void ggml_compute_forward_rope_f16(
|
|
14574
14289
|
const int64_t p = pos[i2];
|
14575
14290
|
|
14576
14291
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
14577
|
-
|
14578
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14579
|
-
}
|
14292
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14580
14293
|
|
14581
14294
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
14582
14295
|
if (ir++ < ir0) continue;
|
14583
14296
|
if (ir > ir1) break;
|
14584
14297
|
|
14585
|
-
|
14586
|
-
|
14587
|
-
if (is_glm) {
|
14588
|
-
theta_base = MIN(p, n_ctx - 2);
|
14589
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
14590
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
14591
|
-
const float cos_theta = cosf(theta_base);
|
14592
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
14593
|
-
const float cos_block_theta = cosf(block_theta);
|
14594
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
14595
|
-
|
14596
|
-
theta_base *= theta_scale;
|
14597
|
-
block_theta *= theta_scale;
|
14598
|
-
|
14599
|
-
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14600
|
-
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14601
|
-
|
14602
|
-
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
14603
|
-
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
14604
|
-
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
|
14605
|
-
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
|
14606
|
-
|
14607
|
-
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
14608
|
-
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14609
|
-
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
14610
|
-
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
14611
|
-
}
|
14612
|
-
} else if (!is_neox) {
|
14613
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14298
|
+
if (!is_neox) {
|
14299
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14614
14300
|
const float cos_theta = cache[i0 + 0];
|
14615
14301
|
const float sin_theta = cache[i0 + 1];
|
14616
14302
|
|
@@ -14624,40 +14310,29 @@ static void ggml_compute_forward_rope_f16(
|
|
14624
14310
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14625
14311
|
}
|
14626
14312
|
} else {
|
14627
|
-
|
14628
|
-
|
14629
|
-
if (ic < n_dims) {
|
14630
|
-
const int64_t i0 = ic/2;
|
14313
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14314
|
+
const int64_t ic = i0/2;
|
14631
14315
|
|
14632
|
-
|
14633
|
-
|
14634
|
-
float cos_theta, sin_theta;
|
14635
|
-
rope_yarn(
|
14636
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
14637
|
-
&cos_theta, &sin_theta
|
14638
|
-
);
|
14639
|
-
|
14640
|
-
sin_theta *= sin_sign;
|
14641
|
-
theta_base *= theta_scale;
|
14316
|
+
const float cos_theta = cache[i0 + 0];
|
14317
|
+
const float sin_theta = cache[i0 + 1];
|
14642
14318
|
|
14643
|
-
|
14644
|
-
|
14319
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
14320
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
14645
14321
|
|
14646
|
-
|
14647
|
-
|
14322
|
+
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
14323
|
+
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
14648
14324
|
|
14649
|
-
|
14650
|
-
|
14651
|
-
|
14652
|
-
|
14325
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
14326
|
+
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14327
|
+
}
|
14328
|
+
}
|
14653
14329
|
|
14654
|
-
|
14655
|
-
|
14330
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
14331
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14332
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14656
14333
|
|
14657
|
-
|
14658
|
-
|
14659
|
-
}
|
14660
|
-
}
|
14334
|
+
dst_data[0] = src[0];
|
14335
|
+
dst_data[1] = src[1];
|
14661
14336
|
}
|
14662
14337
|
}
|
14663
14338
|
}
|
@@ -16844,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
|
|
16844
16519
|
|
16845
16520
|
const struct ggml_tensor * src0 = dst->src[0];
|
16846
16521
|
|
16847
|
-
|
16522
|
+
assert(params->ith == 0);
|
16523
|
+
assert(ggml_is_contiguous_1(src0));
|
16524
|
+
assert(ggml_is_contiguous_1(dst));
|
16525
|
+
assert(ggml_are_same_shape(src0, dst));
|
16848
16526
|
|
16849
16527
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
16850
16528
|
return;
|
@@ -16853,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|
16853
16531
|
const int n = ggml_nrows(src0);
|
16854
16532
|
const int nc = src0->ne[0];
|
16855
16533
|
|
16856
|
-
assert( dst->nb[0] == sizeof(float));
|
16857
|
-
assert(src0->nb[0] == sizeof(float));
|
16858
|
-
|
16859
16534
|
for (int i = 0; i < n; i++) {
|
16860
16535
|
fun(nc,
|
16861
16536
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -16893,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
|
|
16893
16568
|
const struct ggml_tensor * src1 = dst->src[1];
|
16894
16569
|
|
16895
16570
|
assert(params->ith == 0);
|
16571
|
+
assert(ggml_is_contiguous_1(src0));
|
16572
|
+
assert(ggml_is_contiguous_1(src1));
|
16573
|
+
assert(ggml_is_contiguous_1(dst));
|
16896
16574
|
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
16897
16575
|
|
16898
16576
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -16902,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|
16902
16580
|
const int n = ggml_nrows(src0);
|
16903
16581
|
const int nc = src0->ne[0];
|
16904
16582
|
|
16905
|
-
assert( dst->nb[0] == sizeof(float));
|
16906
|
-
assert(src0->nb[0] == sizeof(float));
|
16907
|
-
assert(src1->nb[0] == sizeof(float));
|
16908
|
-
|
16909
16583
|
for (int i = 0; i < n; i++) {
|
16910
16584
|
fun(nc,
|
16911
16585
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -18359,9 +18033,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18359
18033
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
18360
18034
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
18361
18035
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
18362
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18363
|
-
const int
|
18364
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
18036
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18037
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
18038
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
18365
18039
|
|
18366
18040
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
18367
18041
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
@@ -18369,8 +18043,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18369
18043
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
18370
18044
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
18371
18045
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
18372
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
18373
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
18374
18046
|
|
18375
18047
|
src0->grad = ggml_add_or_set(ctx,
|
18376
18048
|
src0->grad,
|
@@ -18380,16 +18052,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18380
18052
|
src2,
|
18381
18053
|
n_dims,
|
18382
18054
|
mode,
|
18383
|
-
|
18384
|
-
n_orig_ctx,
|
18055
|
+
n_ctx_orig,
|
18385
18056
|
freq_base,
|
18386
18057
|
freq_scale,
|
18387
18058
|
ext_factor,
|
18388
18059
|
attn_factor,
|
18389
18060
|
beta_fast,
|
18390
|
-
beta_slow,
|
18391
|
-
xpos_base,
|
18392
|
-
xpos_down),
|
18061
|
+
beta_slow),
|
18393
18062
|
zero_table);
|
18394
18063
|
}
|
18395
18064
|
} break;
|
@@ -18399,9 +18068,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18399
18068
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
18400
18069
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
18401
18070
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
18402
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18403
|
-
const int
|
18404
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
18071
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18072
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
18073
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
18405
18074
|
|
18406
18075
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
18407
18076
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
@@ -18409,8 +18078,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18409
18078
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
18410
18079
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
18411
18080
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
18412
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
18413
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
18414
18081
|
|
18415
18082
|
src0->grad = ggml_add_or_set(ctx,
|
18416
18083
|
src0->grad,
|
@@ -18420,16 +18087,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18420
18087
|
src2,
|
18421
18088
|
n_dims,
|
18422
18089
|
mode,
|
18423
|
-
|
18424
|
-
n_orig_ctx,
|
18090
|
+
n_ctx_orig,
|
18425
18091
|
freq_base,
|
18426
18092
|
freq_scale,
|
18427
18093
|
ext_factor,
|
18428
18094
|
attn_factor,
|
18429
18095
|
beta_fast,
|
18430
18096
|
beta_slow,
|
18431
|
-
xpos_base,
|
18432
|
-
xpos_down,
|
18433
18097
|
false),
|
18434
18098
|
zero_table);
|
18435
18099
|
}
|
@@ -19073,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
19073
18737
|
switch (node->op) {
|
19074
18738
|
case GGML_OP_CPY:
|
19075
18739
|
case GGML_OP_DUP:
|
18740
|
+
case GGML_OP_CONT:
|
19076
18741
|
case GGML_OP_ADD:
|
19077
18742
|
case GGML_OP_ADD1:
|
19078
18743
|
case GGML_OP_ACC:
|
@@ -19157,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
19157
18822
|
} break;
|
19158
18823
|
case GGML_OP_SCALE:
|
19159
18824
|
case GGML_OP_SET:
|
19160
|
-
case GGML_OP_CONT:
|
19161
18825
|
case GGML_OP_RESHAPE:
|
19162
18826
|
case GGML_OP_VIEW:
|
19163
18827
|
case GGML_OP_PERMUTE:
|
@@ -19317,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
|
19317
18981
|
sched_yield();
|
19318
18982
|
}
|
19319
18983
|
|
19320
|
-
*
|
19321
|
-
if (*
|
18984
|
+
*node_n = atomic_load(&state->shared->node_n);
|
18985
|
+
if (*node_n != last_node_n) {
|
18986
|
+
break;
|
18987
|
+
}
|
18988
|
+
|
19322
18989
|
#if defined(__SSE3__)
|
19323
18990
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
19324
18991
|
_mm_pause();
|
@@ -19328,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
|
|
19328
18995
|
|
19329
18996
|
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
|
19330
18997
|
// wait for other threads to finish
|
19331
|
-
const int last_task_phase = *
|
18998
|
+
const int last_task_phase = *task_phase;
|
19332
18999
|
|
19333
19000
|
while (true) {
|
19334
19001
|
if (do_yield) {
|
19335
19002
|
sched_yield();
|
19336
19003
|
}
|
19337
19004
|
|
19338
|
-
*
|
19339
|
-
if (*
|
19005
|
+
*task_phase = atomic_load(&state->shared->node_task);
|
19006
|
+
if (*task_phase != last_task_phase) {
|
19007
|
+
break;
|
19008
|
+
}
|
19009
|
+
|
19340
19010
|
#if defined(__SSE3__)
|
19341
19011
|
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
19342
19012
|
_mm_pause();
|
@@ -19536,22 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
19536
19206
|
{
|
19537
19207
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
19538
19208
|
|
19539
|
-
#if defined(GGML_USE_CLBLAST)
|
19540
|
-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
19541
|
-
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
19542
|
-
} else
|
19543
|
-
#endif
|
19544
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
19545
|
-
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
19546
|
-
if (node->src[0]->type != GGML_TYPE_F32) {
|
19547
|
-
// here we need memory for fully dequantized matrix from src0
|
19548
|
-
// take into account that src0 can be broadcasted into src1[2,3]
|
19549
|
-
cur = ggml_type_size(GGML_TYPE_F32)
|
19550
|
-
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
19551
|
-
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
19552
|
-
}
|
19553
|
-
} else
|
19554
|
-
#endif
|
19555
19209
|
if (node->src[1]->type != vec_dot_type) {
|
19556
19210
|
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
19557
19211
|
}
|
@@ -19670,6 +19324,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
19670
19324
|
return cplan;
|
19671
19325
|
}
|
19672
19326
|
|
19327
|
+
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
|
19328
|
+
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
|
19329
|
+
|
19330
|
+
#ifdef GGML_USE_OPENMP
|
19331
|
+
if (n_threads > 1) {
|
19332
|
+
#pragma omp parallel num_threads(n_threads)
|
19333
|
+
{
|
19334
|
+
#pragma omp single
|
19335
|
+
{
|
19336
|
+
// update the number of threads from the actual number of threads that we got from OpenMP
|
19337
|
+
n_threads = omp_get_num_threads();
|
19338
|
+
workers[0].shared->n_threads = n_threads;
|
19339
|
+
workers[0].shared->n_active = n_threads;
|
19340
|
+
}
|
19341
|
+
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
19342
|
+
}
|
19343
|
+
} else {
|
19344
|
+
ggml_graph_compute_thread(&workers[0]);
|
19345
|
+
}
|
19346
|
+
#else
|
19347
|
+
// create thread pool
|
19348
|
+
if (n_threads > 1) {
|
19349
|
+
for (int j = 1; j < n_threads; ++j) {
|
19350
|
+
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
19351
|
+
GGML_ASSERT(rc == 0);
|
19352
|
+
UNUSED(rc);
|
19353
|
+
}
|
19354
|
+
}
|
19355
|
+
|
19356
|
+
// this is a work thread too
|
19357
|
+
ggml_graph_compute_thread(&workers[0]);
|
19358
|
+
|
19359
|
+
// join or kill thread pool
|
19360
|
+
if (n_threads > 1) {
|
19361
|
+
for (int j = 1; j < n_threads; j++) {
|
19362
|
+
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
19363
|
+
GGML_ASSERT(rc == 0);
|
19364
|
+
UNUSED(rc);
|
19365
|
+
}
|
19366
|
+
}
|
19367
|
+
#endif
|
19368
|
+
// don't leave affinity set on the main thread
|
19369
|
+
clear_numa_thread_affinity();
|
19370
|
+
|
19371
|
+
for (int j = 0; j < n_threads; j++) {
|
19372
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS) {
|
19373
|
+
compute_status = workers[j].ec;
|
19374
|
+
break;
|
19375
|
+
}
|
19376
|
+
}
|
19377
|
+
return compute_status;
|
19378
|
+
}
|
19379
|
+
|
19673
19380
|
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
19674
19381
|
{
|
19675
19382
|
GGML_ASSERT(cplan);
|
@@ -19680,7 +19387,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
19680
19387
|
}
|
19681
19388
|
}
|
19682
19389
|
|
19683
|
-
|
19390
|
+
int n_threads = cplan->n_threads;
|
19391
|
+
|
19392
|
+
#if defined(GGML_USE_OPENMP)
|
19393
|
+
n_threads = MIN(n_threads, omp_get_max_threads());
|
19394
|
+
#endif
|
19684
19395
|
|
19685
19396
|
struct ggml_compute_state_shared state_shared = {
|
19686
19397
|
/*.cgraph =*/ cgraph,
|
@@ -19696,47 +19407,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
19696
19407
|
/*.current_chunk; =*/ 0,
|
19697
19408
|
};
|
19698
19409
|
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
19699
|
-
|
19700
|
-
// create thread pool
|
19701
|
-
if (n_threads > 1) {
|
19702
|
-
for (int j = 1; j < n_threads; ++j) {
|
19703
|
-
workers[j] = (struct ggml_compute_state) {
|
19704
|
-
.thrd = 0,
|
19705
|
-
.ith = j,
|
19706
|
-
.shared = &state_shared,
|
19707
|
-
.ec = GGML_STATUS_SUCCESS,
|
19708
|
-
};
|
19709
|
-
|
19710
|
-
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
19711
|
-
GGML_ASSERT(rc == 0);
|
19712
|
-
UNUSED(rc);
|
19713
|
-
}
|
19714
|
-
}
|
19715
|
-
|
19716
|
-
workers[0].ith = 0;
|
19717
|
-
workers[0].shared = &state_shared;
|
19718
|
-
workers[0].ec = GGML_STATUS_SUCCESS;
|
19719
|
-
|
19720
19410
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
19721
19411
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
19722
19412
|
|
19723
|
-
|
19724
|
-
|
19725
|
-
|
19726
|
-
|
19727
|
-
|
19728
|
-
|
19729
|
-
|
19730
|
-
// join or kill thread pool
|
19731
|
-
if (n_threads > 1) {
|
19732
|
-
for (int j = 1; j < n_threads; j++) {
|
19733
|
-
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
19734
|
-
GGML_ASSERT(rc == 0);
|
19735
|
-
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
19736
|
-
compute_status = workers[j].ec;
|
19737
|
-
}
|
19413
|
+
for (int j = 0; j < n_threads; ++j) {
|
19414
|
+
workers[j] = (struct ggml_compute_state) {
|
19415
|
+
.thrd = 0,
|
19416
|
+
.ith = j,
|
19417
|
+
.shared = &state_shared,
|
19418
|
+
.ec = GGML_STATUS_SUCCESS,
|
19419
|
+
};
|
19738
19420
|
}
|
19739
19421
|
|
19422
|
+
enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
|
19423
|
+
|
19740
19424
|
// performance stats (graph)
|
19741
19425
|
{
|
19742
19426
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -22819,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
22819
22503
|
}
|
22820
22504
|
|
22821
22505
|
int ggml_cpu_has_blas(void) {
|
22822
|
-
#if defined(
|
22506
|
+
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
22823
22507
|
return 1;
|
22824
22508
|
#else
|
22825
22509
|
return 0;
|
@@ -22834,14 +22518,6 @@ int ggml_cpu_has_cuda(void) {
|
|
22834
22518
|
#endif
|
22835
22519
|
}
|
22836
22520
|
|
22837
|
-
int ggml_cpu_has_clblast(void) {
|
22838
|
-
#if defined(GGML_USE_CLBLAST)
|
22839
|
-
return 1;
|
22840
|
-
#else
|
22841
|
-
return 0;
|
22842
|
-
#endif
|
22843
|
-
}
|
22844
|
-
|
22845
22521
|
int ggml_cpu_has_vulkan(void) {
|
22846
22522
|
#if defined(GGML_USE_VULKAN)
|
22847
22523
|
return 1;
|
@@ -22875,8 +22551,7 @@ int ggml_cpu_has_rpc(void) {
|
|
22875
22551
|
}
|
22876
22552
|
|
22877
22553
|
int ggml_cpu_has_gpublas(void) {
|
22878
|
-
return ggml_cpu_has_cuda() ||
|
22879
|
-
ggml_cpu_has_sycl();
|
22554
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
22880
22555
|
}
|
22881
22556
|
|
22882
22557
|
int ggml_cpu_has_sse3(void) {
|