llama_cpp 0.15.3 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +27 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +15 -1
- data/vendor/tmp/llama.cpp/Makefile +66 -36
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +35 -16
- data/vendor/tmp/llama.cpp/ggml-impl.h +4 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -7
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +99 -35
- data/vendor/tmp/llama.cpp/ggml-metal.metal +146 -80
- data/vendor/tmp/llama.cpp/ggml-quants.c +101 -11
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +75 -58
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +345 -227
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +458 -329
- data/vendor/tmp/llama.cpp/ggml.c +301 -409
- data/vendor/tmp/llama.cpp/ggml.h +19 -23
- data/vendor/tmp/llama.cpp/llama.cpp +855 -651
- data/vendor/tmp/llama.cpp/llama.h +28 -48
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -5,6 +5,7 @@
|
|
5
5
|
#include "ggml-quants.h"
|
6
6
|
#include "ggml.h"
|
7
7
|
|
8
|
+
|
8
9
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
9
10
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
10
11
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
@@ -28,6 +29,10 @@
|
|
28
29
|
#include <syscall.h>
|
29
30
|
#endif
|
30
31
|
|
32
|
+
#ifdef GGML_USE_OPENMP
|
33
|
+
#include <omp.h>
|
34
|
+
#endif
|
35
|
+
|
31
36
|
#ifdef GGML_USE_METAL
|
32
37
|
#include <unistd.h>
|
33
38
|
#endif
|
@@ -60,6 +65,9 @@
|
|
60
65
|
|
61
66
|
typedef volatile LONG atomic_int;
|
62
67
|
typedef atomic_int atomic_bool;
|
68
|
+
typedef atomic_int atomic_flag;
|
69
|
+
|
70
|
+
#define ATOMIC_FLAG_INIT 0
|
63
71
|
|
64
72
|
static void atomic_store(atomic_int * ptr, LONG val) {
|
65
73
|
InterlockedExchange(ptr, val);
|
@@ -73,6 +81,12 @@ static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
|
|
73
81
|
static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
|
74
82
|
return atomic_fetch_add(ptr, -(dec));
|
75
83
|
}
|
84
|
+
static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
|
85
|
+
return InterlockedExchange(ptr, 1);
|
86
|
+
}
|
87
|
+
static void atomic_flag_clear(atomic_flag * ptr) {
|
88
|
+
InterlockedExchange(ptr, 0);
|
89
|
+
}
|
76
90
|
|
77
91
|
typedef HANDLE pthread_t;
|
78
92
|
|
@@ -283,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
283
297
|
|
284
298
|
#if defined(GGML_USE_ACCELERATE)
|
285
299
|
#include <Accelerate/Accelerate.h>
|
286
|
-
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
287
|
-
#include "ggml-opencl.h"
|
288
|
-
#endif
|
289
300
|
#elif defined(GGML_USE_OPENBLAS)
|
290
301
|
#if defined(GGML_BLAS_USE_MKL)
|
291
302
|
#include <mkl.h>
|
292
303
|
#else
|
293
304
|
#include <cblas.h>
|
294
305
|
#endif
|
295
|
-
#elif defined(GGML_USE_CLBLAST)
|
296
|
-
#include "ggml-opencl.h"
|
297
306
|
#endif
|
298
307
|
|
299
308
|
// floating point type used to accumulate sums
|
@@ -1567,11 +1576,11 @@ do { \
|
|
1567
1576
|
|
1568
1577
|
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
1569
1578
|
|
1570
|
-
#define GGML_F32Cx8
|
1579
|
+
#define GGML_F32Cx8 __m256
|
1571
1580
|
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
1572
1581
|
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
1573
1582
|
|
1574
|
-
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
|
1583
|
+
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
1575
1584
|
float tmp[8];
|
1576
1585
|
|
1577
1586
|
for (int i = 0; i < 8; i++) {
|
@@ -1580,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
|
|
1580
1589
|
|
1581
1590
|
return (__m256)__lasx_xvld(tmp, 0);
|
1582
1591
|
}
|
1583
|
-
static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
1592
|
+
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
1584
1593
|
float arr[8];
|
1585
1594
|
|
1586
1595
|
__lasx_xvst(y, arr, 0);
|
1587
1596
|
|
1588
|
-
for (int i = 0; i < 8; i++)
|
1597
|
+
for (int i = 0; i < 8; i++) {
|
1589
1598
|
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
1599
|
+
}
|
1590
1600
|
}
|
1591
1601
|
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
|
1592
1602
|
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
|
@@ -1662,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1662
1672
|
#define GGML_F16_STEP 32
|
1663
1673
|
#define GGML_F16_EPR 4
|
1664
1674
|
|
1665
|
-
static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
|
1675
|
+
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
|
1666
1676
|
float tmp[4];
|
1667
1677
|
|
1668
1678
|
tmp[0] = GGML_FP16_TO_FP32(x[0]);
|
@@ -1673,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
|
|
1673
1683
|
return __lsx_vld(tmp, 0);
|
1674
1684
|
}
|
1675
1685
|
|
1676
|
-
static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
|
1686
|
+
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
|
1677
1687
|
float arr[4];
|
1678
1688
|
|
1679
1689
|
__lsx_vst(y, arr, 0);
|
@@ -1746,7 +1756,7 @@ struct ggml_compute_state_shared {
|
|
1746
1756
|
int64_t perf_node_start_cycles;
|
1747
1757
|
int64_t perf_node_start_time_us;
|
1748
1758
|
|
1749
|
-
|
1759
|
+
int n_threads;
|
1750
1760
|
|
1751
1761
|
// synchronization primitives
|
1752
1762
|
atomic_int n_active; // num active threads
|
@@ -2257,6 +2267,11 @@ inline static float ggml_silu_f32(float x) {
|
|
2257
2267
|
return x/(1.0f + expf(-x));
|
2258
2268
|
}
|
2259
2269
|
|
2270
|
+
#if __FINITE_MATH_ONLY__
|
2271
|
+
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
2272
|
+
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
2273
|
+
#endif
|
2274
|
+
|
2260
2275
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
2261
2276
|
|
2262
2277
|
// adapted from arm limited optimized routine
|
@@ -2306,32 +2321,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
|
|
2306
2321
|
const __m512 r = _mm512_set1_ps(0x1.8p23f);
|
2307
2322
|
const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
|
2308
2323
|
const __m512 n = _mm512_sub_ps(z, r);
|
2309
|
-
const __m512 b =
|
2310
|
-
|
2311
|
-
|
2312
|
-
const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
|
2313
|
-
const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
|
2314
|
-
const __m512 u = _mm512_mul_ps(b, b);
|
2315
|
-
const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
|
2316
|
-
_mm512_set1_ps(0x1.573e2ep-5f)), u,
|
2317
|
-
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
|
2318
|
-
_mm512_set1_ps(0x1.fffdb6p-2f))),
|
2319
|
-
u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
|
2320
|
-
if (_mm512_kortestz(c, c))
|
2321
|
-
return _mm512_fmadd_ps(j, k, k);
|
2322
|
-
const __m512i g = _mm512_and_si512(
|
2323
|
-
_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
|
2324
|
-
_mm512_set1_epi32(0x82000000u));
|
2325
|
-
const __m512 s1 =
|
2326
|
-
_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
|
2327
|
-
const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
|
2324
|
+
const __m512 b =
|
2325
|
+
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
|
2326
|
+
_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
|
2328
2327
|
const __mmask16 d =
|
2329
2328
|
_mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
|
2330
|
-
|
2331
|
-
|
2332
|
-
|
2333
|
-
|
2334
|
-
|
2329
|
+
const __m512 u = _mm512_mul_ps(b, b);
|
2330
|
+
const __m512 j = _mm512_fmadd_ps(
|
2331
|
+
_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
|
2332
|
+
_mm512_set1_ps(0x1.573e2ep-5f)),
|
2333
|
+
u,
|
2334
|
+
_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
|
2335
|
+
_mm512_set1_ps(0x1.fffdb6p-2f))),
|
2336
|
+
u,
|
2337
|
+
_mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
|
2338
|
+
const __m512 res = _mm512_scalef_ps(j, n);
|
2339
|
+
if (_mm512_kortestz(d, d))
|
2340
|
+
return res;
|
2341
|
+
const __m512 zero = _mm512_setzero_ps();
|
2342
|
+
const __m512 alt = _mm512_mask_blend_ps(
|
2343
|
+
_mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
|
2344
|
+
return _mm512_mask_blend_ps(d, res, alt);
|
2335
2345
|
}
|
2336
2346
|
|
2337
2347
|
// computes silu x/(1+exp(-x)) in single precision vector
|
@@ -2883,24 +2893,20 @@ struct ggml_state {
|
|
2883
2893
|
|
2884
2894
|
// global state
|
2885
2895
|
static struct ggml_state g_state;
|
2886
|
-
static
|
2896
|
+
static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
|
2887
2897
|
|
2888
2898
|
// barrier via spin lock
|
2889
2899
|
inline static void ggml_critical_section_start(void) {
|
2890
|
-
|
2891
|
-
|
2892
|
-
|
2893
|
-
// wait for other threads to finish
|
2894
|
-
atomic_fetch_sub(&g_state_barrier, 1);
|
2895
|
-
sched_yield(); // TODO: reconsider this
|
2896
|
-
processing = atomic_fetch_add(&g_state_barrier, 1);
|
2900
|
+
while (atomic_flag_test_and_set(&g_state_critical)) {
|
2901
|
+
// spin
|
2902
|
+
sched_yield();
|
2897
2903
|
}
|
2898
2904
|
}
|
2899
2905
|
|
2900
2906
|
// TODO: make this somehow automatically executed
|
2901
2907
|
// some sort of "sentry" mechanism
|
2902
2908
|
inline static void ggml_critical_section_end(void) {
|
2903
|
-
|
2909
|
+
atomic_flag_clear(&g_state_critical);
|
2904
2910
|
}
|
2905
2911
|
|
2906
2912
|
#if defined(__gnu_linux__)
|
@@ -3216,7 +3222,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|
3216
3222
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3217
3223
|
}
|
3218
3224
|
|
3219
|
-
|
3225
|
+
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
3226
|
+
return ggml_is_contiguous(tensor);
|
3227
|
+
}
|
3228
|
+
|
3229
|
+
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
3220
3230
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3221
3231
|
|
3222
3232
|
return
|
@@ -3225,6 +3235,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
|
|
3225
3235
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3226
3236
|
}
|
3227
3237
|
|
3238
|
+
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
3239
|
+
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3240
|
+
|
3241
|
+
return
|
3242
|
+
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
3243
|
+
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3244
|
+
}
|
3245
|
+
|
3228
3246
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
3229
3247
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3230
3248
|
|
@@ -3357,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
3357
3375
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
3358
3376
|
}
|
3359
3377
|
|
3360
|
-
#if defined(GGML_USE_CLBLAST)
|
3361
|
-
ggml_cl_init();
|
3362
|
-
#endif
|
3363
|
-
|
3364
3378
|
ggml_setup_op_has_task_pass();
|
3365
3379
|
|
3366
3380
|
is_first_call = false;
|
@@ -4882,10 +4896,21 @@ struct ggml_tensor * ggml_repeat_back(
|
|
4882
4896
|
// ggml_concat
|
4883
4897
|
|
4884
4898
|
struct ggml_tensor * ggml_concat(
|
4885
|
-
struct ggml_context* ctx,
|
4886
|
-
struct ggml_tensor* a,
|
4887
|
-
struct ggml_tensor* b
|
4888
|
-
|
4899
|
+
struct ggml_context * ctx,
|
4900
|
+
struct ggml_tensor * a,
|
4901
|
+
struct ggml_tensor * b,
|
4902
|
+
int dim) {
|
4903
|
+
GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);
|
4904
|
+
|
4905
|
+
int64_t ne[GGML_MAX_DIMS];
|
4906
|
+
for (int d = 0; d < GGML_MAX_DIMS; ++d) {
|
4907
|
+
if (d == dim) {
|
4908
|
+
ne[d] = a->ne[d] + b->ne[d];
|
4909
|
+
continue;
|
4910
|
+
}
|
4911
|
+
GGML_ASSERT(a->ne[d] == b->ne[d]);
|
4912
|
+
ne[d] = a->ne[d];
|
4913
|
+
}
|
4889
4914
|
|
4890
4915
|
bool is_node = false;
|
4891
4916
|
|
@@ -4893,7 +4918,9 @@ struct ggml_tensor * ggml_concat(
|
|
4893
4918
|
is_node = true;
|
4894
4919
|
}
|
4895
4920
|
|
4896
|
-
struct ggml_tensor * result =
|
4921
|
+
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);
|
4922
|
+
|
4923
|
+
ggml_set_op_params_i32(result, 0, dim);
|
4897
4924
|
|
4898
4925
|
result->op = GGML_OP_CONCAT;
|
4899
4926
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5013,6 +5040,7 @@ struct ggml_tensor * ggml_leaky_relu(
|
|
5013
5040
|
}
|
5014
5041
|
|
5015
5042
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5043
|
+
|
5016
5044
|
ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
|
5017
5045
|
|
5018
5046
|
result->op = GGML_OP_LEAKY_RELU;
|
@@ -6222,16 +6250,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
6222
6250
|
struct ggml_tensor * c,
|
6223
6251
|
int n_dims,
|
6224
6252
|
int mode,
|
6225
|
-
int
|
6226
|
-
int n_orig_ctx,
|
6253
|
+
int n_ctx_orig,
|
6227
6254
|
float freq_base,
|
6228
6255
|
float freq_scale,
|
6229
6256
|
float ext_factor,
|
6230
6257
|
float attn_factor,
|
6231
6258
|
float beta_fast,
|
6232
6259
|
float beta_slow,
|
6233
|
-
float xpos_base,
|
6234
|
-
bool xpos_down,
|
6235
6260
|
bool inplace) {
|
6236
6261
|
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
|
6237
6262
|
|
@@ -6252,15 +6277,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
6252
6277
|
|
6253
6278
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
6254
6279
|
|
6255
|
-
int32_t params[
|
6280
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
6256
6281
|
memcpy(params + 5, &freq_base, sizeof(float));
|
6257
6282
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
6258
6283
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
6259
6284
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
6260
6285
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
6261
6286
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
6262
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
6263
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
6264
6287
|
ggml_set_op_params(result, params, sizeof(params));
|
6265
6288
|
|
6266
6289
|
result->op = GGML_OP_ROPE;
|
@@ -6277,10 +6300,9 @@ struct ggml_tensor * ggml_rope(
|
|
6277
6300
|
struct ggml_tensor * a,
|
6278
6301
|
struct ggml_tensor * b,
|
6279
6302
|
int n_dims,
|
6280
|
-
int mode
|
6281
|
-
int n_ctx) {
|
6303
|
+
int mode) {
|
6282
6304
|
return ggml_rope_impl(
|
6283
|
-
ctx, a, b, NULL, n_dims, mode,
|
6305
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
6284
6306
|
);
|
6285
6307
|
}
|
6286
6308
|
|
@@ -6289,10 +6311,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
6289
6311
|
struct ggml_tensor * a,
|
6290
6312
|
struct ggml_tensor * b,
|
6291
6313
|
int n_dims,
|
6292
|
-
int mode
|
6293
|
-
int n_ctx) {
|
6314
|
+
int mode) {
|
6294
6315
|
return ggml_rope_impl(
|
6295
|
-
ctx, a, b, NULL, n_dims, mode,
|
6316
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
6296
6317
|
);
|
6297
6318
|
}
|
6298
6319
|
|
@@ -6303,8 +6324,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
6303
6324
|
struct ggml_tensor * c,
|
6304
6325
|
int n_dims,
|
6305
6326
|
int mode,
|
6306
|
-
int
|
6307
|
-
int n_orig_ctx,
|
6327
|
+
int n_ctx_orig,
|
6308
6328
|
float freq_base,
|
6309
6329
|
float freq_scale,
|
6310
6330
|
float ext_factor,
|
@@ -6312,8 +6332,8 @@ struct ggml_tensor * ggml_rope_ext(
|
|
6312
6332
|
float beta_fast,
|
6313
6333
|
float beta_slow) {
|
6314
6334
|
return ggml_rope_impl(
|
6315
|
-
ctx, a, b, c, n_dims, mode,
|
6316
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6335
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6336
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
6317
6337
|
);
|
6318
6338
|
}
|
6319
6339
|
|
@@ -6324,8 +6344,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
6324
6344
|
struct ggml_tensor * c,
|
6325
6345
|
int n_dims,
|
6326
6346
|
int mode,
|
6327
|
-
int
|
6328
|
-
int n_orig_ctx,
|
6347
|
+
int n_ctx_orig,
|
6329
6348
|
float freq_base,
|
6330
6349
|
float freq_scale,
|
6331
6350
|
float ext_factor,
|
@@ -6333,8 +6352,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
6333
6352
|
float beta_fast,
|
6334
6353
|
float beta_slow) {
|
6335
6354
|
return ggml_rope_impl(
|
6336
|
-
ctx, a, b, c, n_dims, mode,
|
6337
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6355
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6356
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
6338
6357
|
);
|
6339
6358
|
}
|
6340
6359
|
|
@@ -6344,8 +6363,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
6344
6363
|
struct ggml_tensor * b,
|
6345
6364
|
int n_dims,
|
6346
6365
|
int mode,
|
6347
|
-
int
|
6348
|
-
int n_orig_ctx,
|
6366
|
+
int n_ctx_orig,
|
6349
6367
|
float freq_base,
|
6350
6368
|
float freq_scale,
|
6351
6369
|
float ext_factor,
|
@@ -6353,8 +6371,8 @@ struct ggml_tensor * ggml_rope_custom(
|
|
6353
6371
|
float beta_fast,
|
6354
6372
|
float beta_slow) {
|
6355
6373
|
return ggml_rope_impl(
|
6356
|
-
ctx, a, b, NULL, n_dims, mode,
|
6357
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6374
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6375
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
6358
6376
|
);
|
6359
6377
|
}
|
6360
6378
|
|
@@ -6364,8 +6382,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
6364
6382
|
struct ggml_tensor * b,
|
6365
6383
|
int n_dims,
|
6366
6384
|
int mode,
|
6367
|
-
int
|
6368
|
-
int n_orig_ctx,
|
6385
|
+
int n_ctx_orig,
|
6369
6386
|
float freq_base,
|
6370
6387
|
float freq_scale,
|
6371
6388
|
float ext_factor,
|
@@ -6373,8 +6390,8 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
6373
6390
|
float beta_fast,
|
6374
6391
|
float beta_slow) {
|
6375
6392
|
return ggml_rope_impl(
|
6376
|
-
ctx, a, b, NULL, n_dims, mode,
|
6377
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
6393
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
6394
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
6378
6395
|
);
|
6379
6396
|
}
|
6380
6397
|
|
@@ -6387,16 +6404,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
6387
6404
|
struct ggml_tensor * c,
|
6388
6405
|
int n_dims,
|
6389
6406
|
int mode,
|
6390
|
-
int
|
6391
|
-
int n_orig_ctx,
|
6407
|
+
int n_ctx_orig,
|
6392
6408
|
float freq_base,
|
6393
6409
|
float freq_scale,
|
6394
6410
|
float ext_factor,
|
6395
6411
|
float attn_factor,
|
6396
6412
|
float beta_fast,
|
6397
|
-
float beta_slow
|
6398
|
-
float xpos_base,
|
6399
|
-
bool xpos_down) {
|
6413
|
+
float beta_slow) {
|
6400
6414
|
GGML_ASSERT(ggml_is_vector(b));
|
6401
6415
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
6402
6416
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
@@ -6412,15 +6426,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
6412
6426
|
|
6413
6427
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
6414
6428
|
|
6415
|
-
int32_t params[
|
6429
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
6416
6430
|
memcpy(params + 5, &freq_base, sizeof(float));
|
6417
6431
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
6418
6432
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
6419
6433
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
6420
6434
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
6421
6435
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
6422
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
6423
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
6424
6436
|
ggml_set_op_params(result, params, sizeof(params));
|
6425
6437
|
|
6426
6438
|
result->op = GGML_OP_ROPE_BACK;
|
@@ -9006,17 +9018,6 @@ static void ggml_compute_forward_add_f32(
|
|
9006
9018
|
const int ith = params->ith;
|
9007
9019
|
const int nth = params->nth;
|
9008
9020
|
|
9009
|
-
#ifdef GGML_USE_CLBLAST
|
9010
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
9011
|
-
// TODO: OpenCL kernel support full broadcast
|
9012
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
9013
|
-
if (ith == 0) {
|
9014
|
-
ggml_cl_add(src0, src1, dst);
|
9015
|
-
}
|
9016
|
-
return;
|
9017
|
-
}
|
9018
|
-
#endif
|
9019
|
-
|
9020
9021
|
const int nr = ggml_nrows(src0);
|
9021
9022
|
|
9022
9023
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -10124,17 +10125,6 @@ static void ggml_compute_forward_mul_f32(
|
|
10124
10125
|
const int ith = params->ith;
|
10125
10126
|
const int nth = params->nth;
|
10126
10127
|
|
10127
|
-
#if defined(GGML_USE_CLBLAST)
|
10128
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
10129
|
-
// TODO: OpenCL kernel support full broadcast
|
10130
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
10131
|
-
if (ith == 0) {
|
10132
|
-
ggml_cl_mul(src0, src1, dst);
|
10133
|
-
}
|
10134
|
-
return;
|
10135
|
-
}
|
10136
|
-
#endif
|
10137
|
-
|
10138
10128
|
const int64_t nr = ggml_nrows(src0);
|
10139
10129
|
|
10140
10130
|
GGML_TENSOR_BINARY_OP_LOCALS
|
@@ -10967,26 +10957,29 @@ static void ggml_compute_forward_concat_f32(
|
|
10967
10957
|
GGML_ASSERT(nb00 == sizeof(float));
|
10968
10958
|
GGML_ASSERT(nb10 == sizeof(float));
|
10969
10959
|
|
10960
|
+
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
10961
|
+
|
10962
|
+
GGML_ASSERT(dim >= 0 && dim < 4);
|
10963
|
+
|
10964
|
+
int64_t o[4] = {0, 0, 0, 0};
|
10965
|
+
o[dim] = src0->ne[dim];
|
10966
|
+
|
10967
|
+
const float * x;
|
10968
|
+
|
10969
|
+
// TODO: smarter multi-theading
|
10970
10970
|
for (int i3 = 0; i3 < ne3; i3++) {
|
10971
10971
|
for (int i2 = ith; i2 < ne2; i2 += nth) {
|
10972
|
-
|
10973
|
-
for (int
|
10974
|
-
|
10975
|
-
|
10976
|
-
|
10977
|
-
|
10978
|
-
*y = *x;
|
10979
|
-
}
|
10980
|
-
}
|
10981
|
-
} // src1
|
10982
|
-
else {
|
10983
|
-
for (int i1 = 0; i1 < ne1; i1++) {
|
10984
|
-
for (int i0 = 0; i0 < ne0; i0++) {
|
10985
|
-
const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);
|
10986
|
-
|
10987
|
-
float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
|
10988
|
-
*y = *x;
|
10972
|
+
for (int i1 = 0; i1 < ne1; i1++) {
|
10973
|
+
for (int i0 = 0; i0 < ne0; i0++) {
|
10974
|
+
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
10975
|
+
x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
|
10976
|
+
} else {
|
10977
|
+
x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
|
10989
10978
|
}
|
10979
|
+
|
10980
|
+
float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
|
10981
|
+
|
10982
|
+
*y = *x;
|
10990
10983
|
}
|
10991
10984
|
}
|
10992
10985
|
}
|
@@ -10994,8 +10987,8 @@ static void ggml_compute_forward_concat_f32(
|
|
10994
10987
|
}
|
10995
10988
|
|
10996
10989
|
static void ggml_compute_forward_concat(
|
10997
|
-
const struct ggml_compute_params* params,
|
10998
|
-
struct ggml_tensor* dst) {
|
10990
|
+
const struct ggml_compute_params * params,
|
10991
|
+
struct ggml_tensor * dst) {
|
10999
10992
|
|
11000
10993
|
const struct ggml_tensor * src0 = dst->src[0];
|
11001
10994
|
|
@@ -11388,8 +11381,8 @@ static void ggml_compute_forward_gelu_f32(
|
|
11388
11381
|
|
11389
11382
|
const struct ggml_tensor * src0 = dst->src[0];
|
11390
11383
|
|
11391
|
-
GGML_ASSERT(
|
11392
|
-
GGML_ASSERT(
|
11384
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
11385
|
+
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
11393
11386
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
11394
11387
|
|
11395
11388
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11451,8 +11444,8 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
11451
11444
|
|
11452
11445
|
const struct ggml_tensor * src0 = dst->src[0];
|
11453
11446
|
|
11454
|
-
GGML_ASSERT(
|
11455
|
-
GGML_ASSERT(
|
11447
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
11448
|
+
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
11456
11449
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
11457
11450
|
|
11458
11451
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11514,8 +11507,8 @@ static void ggml_compute_forward_silu_f32(
|
|
11514
11507
|
|
11515
11508
|
const struct ggml_tensor * src0 = dst->src[0];
|
11516
11509
|
|
11517
|
-
GGML_ASSERT(
|
11518
|
-
GGML_ASSERT(
|
11510
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
11511
|
+
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
11519
11512
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
11520
11513
|
|
11521
11514
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11626,9 +11619,9 @@ static void ggml_compute_forward_silu_back_f32(
|
|
11626
11619
|
const struct ggml_tensor * src0 = dst->src[0];
|
11627
11620
|
const struct ggml_tensor * grad = dst->src[1];
|
11628
11621
|
|
11629
|
-
GGML_ASSERT(
|
11630
|
-
GGML_ASSERT(
|
11631
|
-
GGML_ASSERT(
|
11622
|
+
GGML_ASSERT(ggml_is_contiguous_1(grad));
|
11623
|
+
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
11624
|
+
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
11632
11625
|
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
11633
11626
|
GGML_ASSERT(ggml_are_same_shape(src0, grad));
|
11634
11627
|
|
@@ -12367,15 +12360,6 @@ static void ggml_compute_forward_mul_mat(
|
|
12367
12360
|
// nb01 >= nb00 - src0 is not transposed
|
12368
12361
|
// compute by src0 rows
|
12369
12362
|
|
12370
|
-
#if defined(GGML_USE_CLBLAST)
|
12371
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
12372
|
-
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
12373
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
12374
|
-
}
|
12375
|
-
return;
|
12376
|
-
}
|
12377
|
-
#endif
|
12378
|
-
|
12379
12363
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12380
12364
|
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
12381
12365
|
const int64_t ne_plane = ne01*ne00;
|
@@ -12823,8 +12807,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
12823
12807
|
// nb01 >= nb00 - src0 is not transposed
|
12824
12808
|
// compute by src0 rows
|
12825
12809
|
|
12826
|
-
// TODO: #if defined(GGML_USE_CLBLAST)
|
12827
|
-
|
12828
12810
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12829
12811
|
bool use_blas = ggml_is_matrix(src0) &&
|
12830
12812
|
ggml_is_matrix(src1) &&
|
@@ -13022,7 +13004,7 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
13022
13004
|
// nb01 >= nb00 - src0 is not transposed
|
13023
13005
|
// compute by src0 rows
|
13024
13006
|
|
13025
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13007
|
+
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13026
13008
|
|
13027
13009
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
13028
13010
|
if (ith != 0) {
|
@@ -14219,8 +14201,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
|
14219
14201
|
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
14220
14202
|
static void rope_yarn(
|
14221
14203
|
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
14222
|
-
float * cos_theta, float * sin_theta
|
14223
|
-
) {
|
14204
|
+
float * cos_theta, float * sin_theta) {
|
14224
14205
|
// Get n-d rotational scaling corrected for extrapolation
|
14225
14206
|
float theta_interp = freq_scale * theta_extrap;
|
14226
14207
|
float theta = theta_interp;
|
@@ -14237,18 +14218,19 @@ static void rope_yarn(
|
|
14237
14218
|
|
14238
14219
|
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
14239
14220
|
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
14240
|
-
static float ggml_rope_yarn_corr_dim(int n_dims, int
|
14241
|
-
return n_dims * logf(
|
14221
|
+
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
14222
|
+
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
14242
14223
|
}
|
14243
14224
|
|
14244
14225
|
static void ggml_rope_cache_init(
|
14245
|
-
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
14246
|
-
float * cache, float sin_sign, float theta_scale
|
14247
|
-
|
14226
|
+
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
14227
|
+
float * cache, float sin_sign, float theta_scale) {
|
14228
|
+
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
14248
14229
|
float theta = theta_base;
|
14249
14230
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14231
|
+
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
|
14250
14232
|
rope_yarn(
|
14251
|
-
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
14233
|
+
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
14252
14234
|
);
|
14253
14235
|
cache[i0 + 1] *= sin_sign;
|
14254
14236
|
|
@@ -14257,11 +14239,11 @@ static void ggml_rope_cache_init(
|
|
14257
14239
|
}
|
14258
14240
|
|
14259
14241
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
14260
|
-
int n_dims, int
|
14242
|
+
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
14261
14243
|
) {
|
14262
14244
|
// start and end correction dims
|
14263
|
-
float start = floorf(ggml_rope_yarn_corr_dim(n_dims,
|
14264
|
-
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims,
|
14245
|
+
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
14246
|
+
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
14265
14247
|
dims[0] = MAX(0, start);
|
14266
14248
|
dims[1] = MIN(n_dims - 1, end);
|
14267
14249
|
}
|
@@ -14281,15 +14263,11 @@ static void ggml_compute_forward_rope_f32(
|
|
14281
14263
|
|
14282
14264
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
14283
14265
|
|
14284
|
-
// these two only relevant for xPos RoPE:
|
14285
|
-
float xpos_base;
|
14286
|
-
bool xpos_down;
|
14287
|
-
|
14288
14266
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14289
14267
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
14290
14268
|
const int mode = ((int32_t *) dst->op_params)[2];
|
14291
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14292
|
-
const int
|
14269
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14270
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
14293
14271
|
|
14294
14272
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
14295
14273
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
@@ -14297,8 +14275,6 @@ static void ggml_compute_forward_rope_f32(
|
|
14297
14275
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
14298
14276
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
14299
14277
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
14300
|
-
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
|
14301
|
-
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
|
14302
14278
|
|
14303
14279
|
GGML_TENSOR_UNARY_OP_LOCALS
|
14304
14280
|
|
@@ -14326,22 +14302,17 @@ static void ggml_compute_forward_rope_f32(
|
|
14326
14302
|
int ir = 0;
|
14327
14303
|
|
14328
14304
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
14329
|
-
|
14305
|
+
|
14330
14306
|
float corr_dims[2];
|
14331
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
14307
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
14332
14308
|
|
14333
14309
|
const bool is_neox = mode & 2;
|
14334
|
-
const bool is_glm = mode & 4;
|
14335
14310
|
|
14336
14311
|
const float * freq_factors = NULL;
|
14337
|
-
if (
|
14338
|
-
|
14339
|
-
|
14340
|
-
|
14341
|
-
freq_factors = (const float *) src2->data;
|
14342
|
-
}
|
14343
|
-
} else {
|
14344
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14312
|
+
if (src2 != NULL) {
|
14313
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14314
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14315
|
+
freq_factors = (const float *) src2->data;
|
14345
14316
|
}
|
14346
14317
|
|
14347
14318
|
// backward process uses inverse rotation by cos and sin.
|
@@ -14356,101 +14327,50 @@ static void ggml_compute_forward_rope_f32(
|
|
14356
14327
|
const int64_t p = pos[i2];
|
14357
14328
|
|
14358
14329
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
14359
|
-
|
14360
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14361
|
-
}
|
14330
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14362
14331
|
|
14363
14332
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
14364
14333
|
if (ir++ < ir0) continue;
|
14365
14334
|
if (ir > ir1) break;
|
14366
14335
|
|
14367
|
-
|
14368
|
-
|
14369
|
-
if (is_glm) {
|
14370
|
-
theta_base = MIN(p, n_ctx - 2);
|
14371
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
14372
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
14373
|
-
const float cos_theta = cosf(theta_base);
|
14374
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
14375
|
-
const float cos_block_theta = cosf(block_theta);
|
14376
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
14377
|
-
|
14378
|
-
theta_base *= theta_scale;
|
14379
|
-
block_theta *= theta_scale;
|
14380
|
-
|
14381
|
-
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14382
|
-
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14383
|
-
|
14384
|
-
const float x0 = src[0];
|
14385
|
-
const float x1 = src[n_dims/2];
|
14386
|
-
const float x2 = src[n_dims];
|
14387
|
-
const float x3 = src[n_dims/2*3];
|
14388
|
-
|
14389
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14390
|
-
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
14391
|
-
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
14392
|
-
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
14393
|
-
}
|
14394
|
-
} else if (!is_neox) {
|
14395
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14336
|
+
if (!is_neox) {
|
14337
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14396
14338
|
const float cos_theta = cache[i0 + 0];
|
14397
14339
|
const float sin_theta = cache[i0 + 1];
|
14398
14340
|
|
14399
|
-
// zeta scaling for xPos only:
|
14400
|
-
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
14401
|
-
if (xpos_down) zeta = 1.0f / zeta;
|
14402
|
-
|
14403
14341
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14404
14342
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14405
14343
|
|
14406
14344
|
const float x0 = src[0];
|
14407
14345
|
const float x1 = src[1];
|
14408
14346
|
|
14409
|
-
dst_data[0] = x0*cos_theta
|
14410
|
-
dst_data[1] = x0*sin_theta
|
14347
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14348
|
+
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
14411
14349
|
}
|
14412
14350
|
} else {
|
14413
|
-
|
14414
|
-
|
14415
|
-
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
14416
|
-
theta_base *= freq_scale;
|
14417
|
-
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
14418
|
-
if (ic < n_dims) {
|
14419
|
-
const int64_t ib = 0;
|
14420
|
-
|
14421
|
-
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
14422
|
-
float cur_rot = inv_ndims * ic - ib;
|
14423
|
-
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
|
14351
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14352
|
+
const int64_t ic = i0/2;
|
14424
14353
|
|
14425
|
-
|
14426
|
-
|
14427
|
-
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14428
|
-
&cos_theta, &sin_theta
|
14429
|
-
);
|
14430
|
-
sin_theta *= sin_sign;
|
14431
|
-
|
14432
|
-
theta_base *= theta_scale;
|
14433
|
-
|
14434
|
-
const int64_t i0 = ib*n_dims + ic/2;
|
14354
|
+
const float cos_theta = cache[i0 + 0];
|
14355
|
+
const float sin_theta = cache[i0 + 1];
|
14435
14356
|
|
14436
|
-
|
14437
|
-
|
14357
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
14358
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
14438
14359
|
|
14439
|
-
|
14440
|
-
|
14360
|
+
const float x0 = src[0];
|
14361
|
+
const float x1 = src[n_dims/2];
|
14441
14362
|
|
14442
|
-
|
14443
|
-
|
14444
|
-
|
14445
|
-
|
14363
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
14364
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
14365
|
+
}
|
14366
|
+
}
|
14446
14367
|
|
14447
|
-
|
14448
|
-
|
14368
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
14369
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14370
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14449
14371
|
|
14450
|
-
|
14451
|
-
|
14452
|
-
}
|
14453
|
-
}
|
14372
|
+
dst_data[0] = src[0];
|
14373
|
+
dst_data[1] = src[1];
|
14454
14374
|
}
|
14455
14375
|
}
|
14456
14376
|
}
|
@@ -14476,8 +14396,8 @@ static void ggml_compute_forward_rope_f16(
|
|
14476
14396
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
14477
14397
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
14478
14398
|
const int mode = ((int32_t *) dst->op_params)[2];
|
14479
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14480
|
-
const int
|
14399
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
14400
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
14481
14401
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
14482
14402
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
14483
14403
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
@@ -14511,22 +14431,17 @@ static void ggml_compute_forward_rope_f16(
|
|
14511
14431
|
int ir = 0;
|
14512
14432
|
|
14513
14433
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
14514
|
-
|
14434
|
+
|
14515
14435
|
float corr_dims[2];
|
14516
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
14436
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
14517
14437
|
|
14518
14438
|
const bool is_neox = mode & 2;
|
14519
|
-
const bool is_glm = mode & 4;
|
14520
14439
|
|
14521
14440
|
const float * freq_factors = NULL;
|
14522
|
-
if (
|
14523
|
-
|
14524
|
-
|
14525
|
-
|
14526
|
-
freq_factors = (const float *) src2->data;
|
14527
|
-
}
|
14528
|
-
} else {
|
14529
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
14441
|
+
if (src2 != NULL) {
|
14442
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
14443
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
14444
|
+
freq_factors = (const float *) src2->data;
|
14530
14445
|
}
|
14531
14446
|
|
14532
14447
|
// backward process uses inverse rotation by cos and sin.
|
@@ -14541,43 +14456,14 @@ static void ggml_compute_forward_rope_f16(
|
|
14541
14456
|
const int64_t p = pos[i2];
|
14542
14457
|
|
14543
14458
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
14544
|
-
|
14545
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14546
|
-
}
|
14459
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
14547
14460
|
|
14548
14461
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
14549
14462
|
if (ir++ < ir0) continue;
|
14550
14463
|
if (ir > ir1) break;
|
14551
14464
|
|
14552
|
-
|
14553
|
-
|
14554
|
-
if (is_glm) {
|
14555
|
-
theta_base = MIN(p, n_ctx - 2);
|
14556
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
14557
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
14558
|
-
const float cos_theta = cosf(theta_base);
|
14559
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
14560
|
-
const float cos_block_theta = cosf(block_theta);
|
14561
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
14562
|
-
|
14563
|
-
theta_base *= theta_scale;
|
14564
|
-
block_theta *= theta_scale;
|
14565
|
-
|
14566
|
-
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14567
|
-
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14568
|
-
|
14569
|
-
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
14570
|
-
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
14571
|
-
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
|
14572
|
-
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
|
14573
|
-
|
14574
|
-
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
14575
|
-
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14576
|
-
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
14577
|
-
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
14578
|
-
}
|
14579
|
-
} else if (!is_neox) {
|
14580
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
14465
|
+
if (!is_neox) {
|
14466
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14581
14467
|
const float cos_theta = cache[i0 + 0];
|
14582
14468
|
const float sin_theta = cache[i0 + 1];
|
14583
14469
|
|
@@ -14591,47 +14477,29 @@ static void ggml_compute_forward_rope_f16(
|
|
14591
14477
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14592
14478
|
}
|
14593
14479
|
} else {
|
14594
|
-
|
14595
|
-
|
14596
|
-
// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
|
14597
|
-
theta_base *= freq_scale;
|
14598
|
-
for (int64_t ic = 0; ic < ne0; ic += 2) {
|
14599
|
-
if (ic < n_dims) {
|
14600
|
-
const int64_t ib = 0;
|
14601
|
-
|
14602
|
-
// simplified from `(ib * n_dims + ic) * inv_ndims`
|
14603
|
-
float cur_rot = inv_ndims * ic - ib;
|
14604
|
-
float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
|
14605
|
-
|
14606
|
-
float cos_theta, sin_theta;
|
14607
|
-
rope_yarn(
|
14608
|
-
theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
14609
|
-
&cos_theta, &sin_theta
|
14610
|
-
);
|
14611
|
-
sin_theta *= sin_sign;
|
14612
|
-
|
14613
|
-
theta_base *= theta_scale;
|
14480
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
14481
|
+
const int64_t ic = i0/2;
|
14614
14482
|
|
14615
|
-
|
14483
|
+
const float cos_theta = cache[i0 + 0];
|
14484
|
+
const float sin_theta = cache[i0 + 1];
|
14616
14485
|
|
14617
|
-
|
14618
|
-
|
14486
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
14487
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
14619
14488
|
|
14620
|
-
|
14621
|
-
|
14489
|
+
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
14490
|
+
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
14622
14491
|
|
14623
|
-
|
14624
|
-
|
14625
|
-
|
14626
|
-
|
14492
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
14493
|
+
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
14494
|
+
}
|
14495
|
+
}
|
14627
14496
|
|
14628
|
-
|
14629
|
-
|
14497
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
14498
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
14499
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
14630
14500
|
|
14631
|
-
|
14632
|
-
|
14633
|
-
}
|
14634
|
-
}
|
14501
|
+
dst_data[0] = src[0];
|
14502
|
+
dst_data[1] = src[1];
|
14635
14503
|
}
|
14636
14504
|
}
|
14637
14505
|
}
|
@@ -18333,9 +18201,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18333
18201
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
18334
18202
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
18335
18203
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
18336
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18337
|
-
const int
|
18338
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
18204
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18205
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
18206
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
18339
18207
|
|
18340
18208
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
18341
18209
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
@@ -18343,8 +18211,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18343
18211
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
18344
18212
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
18345
18213
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
18346
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
18347
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
18348
18214
|
|
18349
18215
|
src0->grad = ggml_add_or_set(ctx,
|
18350
18216
|
src0->grad,
|
@@ -18354,16 +18220,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18354
18220
|
src2,
|
18355
18221
|
n_dims,
|
18356
18222
|
mode,
|
18357
|
-
|
18358
|
-
n_orig_ctx,
|
18223
|
+
n_ctx_orig,
|
18359
18224
|
freq_base,
|
18360
18225
|
freq_scale,
|
18361
18226
|
ext_factor,
|
18362
18227
|
attn_factor,
|
18363
18228
|
beta_fast,
|
18364
|
-
beta_slow,
|
18365
|
-
xpos_base,
|
18366
|
-
xpos_down),
|
18229
|
+
beta_slow),
|
18367
18230
|
zero_table);
|
18368
18231
|
}
|
18369
18232
|
} break;
|
@@ -18373,9 +18236,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18373
18236
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
18374
18237
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
18375
18238
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
18376
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18377
|
-
const int
|
18378
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
18239
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
18240
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
18241
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
18379
18242
|
|
18380
18243
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
18381
18244
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
@@ -18383,8 +18246,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18383
18246
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
18384
18247
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
18385
18248
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
18386
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
18387
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
18388
18249
|
|
18389
18250
|
src0->grad = ggml_add_or_set(ctx,
|
18390
18251
|
src0->grad,
|
@@ -18394,16 +18255,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
18394
18255
|
src2,
|
18395
18256
|
n_dims,
|
18396
18257
|
mode,
|
18397
|
-
|
18398
|
-
n_orig_ctx,
|
18258
|
+
n_ctx_orig,
|
18399
18259
|
freq_base,
|
18400
18260
|
freq_scale,
|
18401
18261
|
ext_factor,
|
18402
18262
|
attn_factor,
|
18403
18263
|
beta_fast,
|
18404
18264
|
beta_slow,
|
18405
|
-
xpos_base,
|
18406
|
-
xpos_down,
|
18407
18265
|
false),
|
18408
18266
|
zero_table);
|
18409
18267
|
}
|
@@ -19510,11 +19368,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
19510
19368
|
{
|
19511
19369
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
19512
19370
|
|
19513
|
-
#if defined(GGML_USE_CLBLAST)
|
19514
|
-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
19515
|
-
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
19516
|
-
} else
|
19517
|
-
#endif
|
19518
19371
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
19519
19372
|
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
19520
19373
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
@@ -19644,6 +19497,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
19644
19497
|
return cplan;
|
19645
19498
|
}
|
19646
19499
|
|
19500
|
+
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
|
19501
|
+
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
|
19502
|
+
|
19503
|
+
#ifdef GGML_USE_OPENMP
|
19504
|
+
if (n_threads > 1) {
|
19505
|
+
#pragma omp parallel num_threads(n_threads)
|
19506
|
+
{
|
19507
|
+
#pragma omp single
|
19508
|
+
{
|
19509
|
+
// update the number of threads from the actual number of threads that we got from OpenMP
|
19510
|
+
n_threads = omp_get_num_threads();
|
19511
|
+
workers[0].shared->n_threads = n_threads;
|
19512
|
+
workers[0].shared->n_active = n_threads;
|
19513
|
+
}
|
19514
|
+
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
19515
|
+
}
|
19516
|
+
} else {
|
19517
|
+
ggml_graph_compute_thread(&workers[0]);
|
19518
|
+
}
|
19519
|
+
#else
|
19520
|
+
// create thread pool
|
19521
|
+
if (n_threads > 1) {
|
19522
|
+
for (int j = 1; j < n_threads; ++j) {
|
19523
|
+
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
19524
|
+
GGML_ASSERT(rc == 0);
|
19525
|
+
UNUSED(rc);
|
19526
|
+
}
|
19527
|
+
}
|
19528
|
+
|
19529
|
+
// this is a work thread too
|
19530
|
+
ggml_graph_compute_thread(&workers[0]);
|
19531
|
+
|
19532
|
+
// join or kill thread pool
|
19533
|
+
if (n_threads > 1) {
|
19534
|
+
for (int j = 1; j < n_threads; j++) {
|
19535
|
+
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
19536
|
+
GGML_ASSERT(rc == 0);
|
19537
|
+
UNUSED(rc);
|
19538
|
+
}
|
19539
|
+
}
|
19540
|
+
#endif
|
19541
|
+
// don't leave affinity set on the main thread
|
19542
|
+
clear_numa_thread_affinity();
|
19543
|
+
|
19544
|
+
for (int j = 0; j < n_threads; j++) {
|
19545
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS) {
|
19546
|
+
compute_status = workers[j].ec;
|
19547
|
+
break;
|
19548
|
+
}
|
19549
|
+
}
|
19550
|
+
return compute_status;
|
19551
|
+
}
|
19552
|
+
|
19647
19553
|
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
19648
19554
|
{
|
19649
19555
|
GGML_ASSERT(cplan);
|
@@ -19654,7 +19560,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
19654
19560
|
}
|
19655
19561
|
}
|
19656
19562
|
|
19657
|
-
|
19563
|
+
int n_threads = cplan->n_threads;
|
19564
|
+
|
19565
|
+
#if defined(GGML_USE_OPENMP)
|
19566
|
+
n_threads = MIN(n_threads, omp_get_max_threads());
|
19567
|
+
#endif
|
19658
19568
|
|
19659
19569
|
struct ggml_compute_state_shared state_shared = {
|
19660
19570
|
/*.cgraph =*/ cgraph,
|
@@ -19670,47 +19580,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
19670
19580
|
/*.current_chunk; =*/ 0,
|
19671
19581
|
};
|
19672
19582
|
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
19673
|
-
|
19674
|
-
// create thread pool
|
19675
|
-
if (n_threads > 1) {
|
19676
|
-
for (int j = 1; j < n_threads; ++j) {
|
19677
|
-
workers[j] = (struct ggml_compute_state) {
|
19678
|
-
.thrd = 0,
|
19679
|
-
.ith = j,
|
19680
|
-
.shared = &state_shared,
|
19681
|
-
.ec = GGML_STATUS_SUCCESS,
|
19682
|
-
};
|
19683
|
-
|
19684
|
-
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
19685
|
-
GGML_ASSERT(rc == 0);
|
19686
|
-
UNUSED(rc);
|
19687
|
-
}
|
19688
|
-
}
|
19689
|
-
|
19690
|
-
workers[0].ith = 0;
|
19691
|
-
workers[0].shared = &state_shared;
|
19692
|
-
workers[0].ec = GGML_STATUS_SUCCESS;
|
19693
|
-
|
19694
19583
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
19695
19584
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
19696
19585
|
|
19697
|
-
|
19698
|
-
|
19699
|
-
|
19700
|
-
|
19701
|
-
|
19702
|
-
|
19703
|
-
|
19704
|
-
// join or kill thread pool
|
19705
|
-
if (n_threads > 1) {
|
19706
|
-
for (int j = 1; j < n_threads; j++) {
|
19707
|
-
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
19708
|
-
GGML_ASSERT(rc == 0);
|
19709
|
-
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
19710
|
-
compute_status = workers[j].ec;
|
19711
|
-
}
|
19586
|
+
for (int j = 0; j < n_threads; ++j) {
|
19587
|
+
workers[j] = (struct ggml_compute_state) {
|
19588
|
+
.thrd = 0,
|
19589
|
+
.ith = j,
|
19590
|
+
.shared = &state_shared,
|
19591
|
+
.ec = GGML_STATUS_SUCCESS,
|
19592
|
+
};
|
19712
19593
|
}
|
19713
19594
|
|
19595
|
+
enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
|
19596
|
+
|
19714
19597
|
// performance stats (graph)
|
19715
19598
|
{
|
19716
19599
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
@@ -22742,6 +22625,16 @@ int ggml_cpu_has_neon(void) {
|
|
22742
22625
|
#endif
|
22743
22626
|
}
|
22744
22627
|
|
22628
|
+
int ggml_cpu_has_sve(void) {
|
22629
|
+
#if defined(__ARM_FEATURE_SVE)
|
22630
|
+
// TODO: Currently, SVE 256 bit is only supported.
|
22631
|
+
GGML_ASSERT(svcntb() == QK8_0);
|
22632
|
+
return 1;
|
22633
|
+
#else
|
22634
|
+
return 0;
|
22635
|
+
#endif
|
22636
|
+
}
|
22637
|
+
|
22745
22638
|
int ggml_cpu_has_arm_fma(void) {
|
22746
22639
|
#if defined(__ARM_FEATURE_FMA)
|
22747
22640
|
return 1;
|
@@ -22783,7 +22676,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
22783
22676
|
}
|
22784
22677
|
|
22785
22678
|
int ggml_cpu_has_blas(void) {
|
22786
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(
|
22679
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
22787
22680
|
return 1;
|
22788
22681
|
#else
|
22789
22682
|
return 0;
|
@@ -22798,14 +22691,6 @@ int ggml_cpu_has_cuda(void) {
|
|
22798
22691
|
#endif
|
22799
22692
|
}
|
22800
22693
|
|
22801
|
-
int ggml_cpu_has_clblast(void) {
|
22802
|
-
#if defined(GGML_USE_CLBLAST)
|
22803
|
-
return 1;
|
22804
|
-
#else
|
22805
|
-
return 0;
|
22806
|
-
#endif
|
22807
|
-
}
|
22808
|
-
|
22809
22694
|
int ggml_cpu_has_vulkan(void) {
|
22810
22695
|
#if defined(GGML_USE_VULKAN)
|
22811
22696
|
return 1;
|
@@ -22830,9 +22715,16 @@ int ggml_cpu_has_sycl(void) {
|
|
22830
22715
|
#endif
|
22831
22716
|
}
|
22832
22717
|
|
22718
|
+
int ggml_cpu_has_rpc(void) {
|
22719
|
+
#if defined(GGML_USE_RPC)
|
22720
|
+
return 1;
|
22721
|
+
#else
|
22722
|
+
return 0;
|
22723
|
+
#endif
|
22724
|
+
}
|
22725
|
+
|
22833
22726
|
int ggml_cpu_has_gpublas(void) {
|
22834
|
-
return ggml_cpu_has_cuda() ||
|
22835
|
-
ggml_cpu_has_sycl();
|
22727
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
22836
22728
|
}
|
22837
22729
|
|
22838
22730
|
int ggml_cpu_has_sse3(void) {
|