llama_cpp 0.15.4 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/extconf.rb +1 -2
- data/ext/llama_cpp/llama_cpp.cpp +15 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +13 -1
- data/vendor/tmp/llama.cpp/Makefile +62 -35
- data/vendor/tmp/llama.cpp/ggml-alloc.c +4 -4
- data/vendor/tmp/llama.cpp/ggml-backend.c +5 -5
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +103 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +280 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +34 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +196 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +686 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +490 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +662 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +319 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +312 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +345 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +178 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +104 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +1564 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +404 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +221 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +49 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +94 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +45 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +271 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +31 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +205 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +40 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +9 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +10 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +8 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +47 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +266 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +51 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8 -6
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +21 -6
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -24
- data/vendor/tmp/llama.cpp/ggml-metal.metal +83 -59
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +2 -2
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +7 -67
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +99301 -39793
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +456 -329
- data/vendor/tmp/llama.cpp/ggml.c +178 -330
- data/vendor/tmp/llama.cpp/ggml.h +9 -28
- data/vendor/tmp/llama.cpp/llama.cpp +242 -426
- data/vendor/tmp/llama.cpp/llama.h +17 -43
- metadata +121 -6
- data/vendor/tmp/llama.cpp/ggml-mpi.c +0 -216
- data/vendor/tmp/llama.cpp/ggml-mpi.h +0 -39
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +0 -2305
- data/vendor/tmp/llama.cpp/ggml-opencl.h +0 -36
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include "ggml-quants.h"
|
|
6
6
|
#include "ggml.h"
|
|
7
7
|
|
|
8
|
+
|
|
8
9
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
9
10
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
|
10
11
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
|
@@ -28,6 +29,10 @@
|
|
|
28
29
|
#include <syscall.h>
|
|
29
30
|
#endif
|
|
30
31
|
|
|
32
|
+
#ifdef GGML_USE_OPENMP
|
|
33
|
+
#include <omp.h>
|
|
34
|
+
#endif
|
|
35
|
+
|
|
31
36
|
#ifdef GGML_USE_METAL
|
|
32
37
|
#include <unistd.h>
|
|
33
38
|
#endif
|
|
@@ -292,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
|
292
297
|
|
|
293
298
|
#if defined(GGML_USE_ACCELERATE)
|
|
294
299
|
#include <Accelerate/Accelerate.h>
|
|
295
|
-
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
|
|
296
|
-
#include "ggml-opencl.h"
|
|
297
|
-
#endif
|
|
298
300
|
#elif defined(GGML_USE_OPENBLAS)
|
|
299
301
|
#if defined(GGML_BLAS_USE_MKL)
|
|
300
302
|
#include <mkl.h>
|
|
301
303
|
#else
|
|
302
304
|
#include <cblas.h>
|
|
303
305
|
#endif
|
|
304
|
-
#elif defined(GGML_USE_CLBLAST)
|
|
305
|
-
#include "ggml-opencl.h"
|
|
306
306
|
#endif
|
|
307
307
|
|
|
308
308
|
// floating point type used to accumulate sums
|
|
@@ -1756,7 +1756,7 @@ struct ggml_compute_state_shared {
|
|
|
1756
1756
|
int64_t perf_node_start_cycles;
|
|
1757
1757
|
int64_t perf_node_start_time_us;
|
|
1758
1758
|
|
|
1759
|
-
|
|
1759
|
+
int n_threads;
|
|
1760
1760
|
|
|
1761
1761
|
// synchronization primitives
|
|
1762
1762
|
atomic_int n_active; // num active threads
|
|
@@ -2267,6 +2267,11 @@ inline static float ggml_silu_f32(float x) {
|
|
|
2267
2267
|
return x/(1.0f + expf(-x));
|
|
2268
2268
|
}
|
|
2269
2269
|
|
|
2270
|
+
#if __FINITE_MATH_ONLY__
|
|
2271
|
+
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
|
|
2272
|
+
#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
|
|
2273
|
+
#endif
|
|
2274
|
+
|
|
2270
2275
|
#if defined(__ARM_NEON) && defined(__aarch64__)
|
|
2271
2276
|
|
|
2272
2277
|
// adapted from arm limited optimized routine
|
|
@@ -3370,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
|
3370
3375
|
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
|
3371
3376
|
}
|
|
3372
3377
|
|
|
3373
|
-
#if defined(GGML_USE_CLBLAST)
|
|
3374
|
-
ggml_cl_init();
|
|
3375
|
-
#endif
|
|
3376
|
-
|
|
3377
3378
|
ggml_setup_op_has_task_pass();
|
|
3378
3379
|
|
|
3379
3380
|
is_first_call = false;
|
|
@@ -6249,16 +6250,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
6249
6250
|
struct ggml_tensor * c,
|
|
6250
6251
|
int n_dims,
|
|
6251
6252
|
int mode,
|
|
6252
|
-
int n_ctx,
|
|
6253
|
-
int n_orig_ctx,
|
|
6253
|
+
int n_ctx_orig,
|
|
6254
6254
|
float freq_base,
|
|
6255
6255
|
float freq_scale,
|
|
6256
6256
|
float ext_factor,
|
|
6257
6257
|
float attn_factor,
|
|
6258
6258
|
float beta_fast,
|
|
6259
6259
|
float beta_slow,
|
|
6260
|
-
float xpos_base,
|
|
6261
|
-
bool xpos_down,
|
|
6262
6260
|
bool inplace) {
|
|
6263
6261
|
GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
|
|
6264
6262
|
|
|
@@ -6279,15 +6277,13 @@ static struct ggml_tensor * ggml_rope_impl(
|
|
|
6279
6277
|
|
|
6280
6278
|
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
|
6281
6279
|
|
|
6282
|
-
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
|
|
6280
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
6283
6281
|
memcpy(params + 5, &freq_base, sizeof(float));
|
|
6284
6282
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
|
6285
6283
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
|
6286
6284
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
6287
6285
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
6288
6286
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
6289
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
|
6290
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
|
6291
6287
|
ggml_set_op_params(result, params, sizeof(params));
|
|
6292
6288
|
|
|
6293
6289
|
result->op = GGML_OP_ROPE;
|
|
@@ -6304,10 +6300,9 @@ struct ggml_tensor * ggml_rope(
|
|
|
6304
6300
|
struct ggml_tensor * a,
|
|
6305
6301
|
struct ggml_tensor * b,
|
|
6306
6302
|
int n_dims,
|
|
6307
|
-
int mode,
|
|
6308
|
-
int n_ctx) {
|
|
6303
|
+
int mode) {
|
|
6309
6304
|
return ggml_rope_impl(
|
|
6310
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6305
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
|
|
6311
6306
|
);
|
|
6312
6307
|
}
|
|
6313
6308
|
|
|
@@ -6316,10 +6311,9 @@ struct ggml_tensor * ggml_rope_inplace(
|
|
|
6316
6311
|
struct ggml_tensor * a,
|
|
6317
6312
|
struct ggml_tensor * b,
|
|
6318
6313
|
int n_dims,
|
|
6319
|
-
int mode,
|
|
6320
|
-
int n_ctx) {
|
|
6314
|
+
int mode) {
|
|
6321
6315
|
return ggml_rope_impl(
|
|
6322
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6316
|
+
ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
|
|
6323
6317
|
);
|
|
6324
6318
|
}
|
|
6325
6319
|
|
|
@@ -6330,8 +6324,7 @@ struct ggml_tensor * ggml_rope_ext(
|
|
|
6330
6324
|
struct ggml_tensor * c,
|
|
6331
6325
|
int n_dims,
|
|
6332
6326
|
int mode,
|
|
6333
|
-
int n_ctx,
|
|
6334
|
-
int n_orig_ctx,
|
|
6327
|
+
int n_ctx_orig,
|
|
6335
6328
|
float freq_base,
|
|
6336
6329
|
float freq_scale,
|
|
6337
6330
|
float ext_factor,
|
|
@@ -6339,8 +6332,8 @@ struct ggml_tensor * ggml_rope_ext(
|
|
|
6339
6332
|
float beta_fast,
|
|
6340
6333
|
float beta_slow) {
|
|
6341
6334
|
return ggml_rope_impl(
|
|
6342
|
-
ctx, a, b, c, n_dims, mode,
|
|
6343
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6335
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6336
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
6344
6337
|
);
|
|
6345
6338
|
}
|
|
6346
6339
|
|
|
@@ -6351,8 +6344,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
|
6351
6344
|
struct ggml_tensor * c,
|
|
6352
6345
|
int n_dims,
|
|
6353
6346
|
int mode,
|
|
6354
|
-
int n_ctx,
|
|
6355
|
-
int n_orig_ctx,
|
|
6347
|
+
int n_ctx_orig,
|
|
6356
6348
|
float freq_base,
|
|
6357
6349
|
float freq_scale,
|
|
6358
6350
|
float ext_factor,
|
|
@@ -6360,8 +6352,8 @@ struct ggml_tensor * ggml_rope_ext_inplace(
|
|
|
6360
6352
|
float beta_fast,
|
|
6361
6353
|
float beta_slow) {
|
|
6362
6354
|
return ggml_rope_impl(
|
|
6363
|
-
ctx, a, b, c, n_dims, mode,
|
|
6364
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6355
|
+
ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6356
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
6365
6357
|
);
|
|
6366
6358
|
}
|
|
6367
6359
|
|
|
@@ -6371,8 +6363,7 @@ struct ggml_tensor * ggml_rope_custom(
|
|
|
6371
6363
|
struct ggml_tensor * b,
|
|
6372
6364
|
int n_dims,
|
|
6373
6365
|
int mode,
|
|
6374
|
-
int n_ctx,
|
|
6375
|
-
int n_orig_ctx,
|
|
6366
|
+
int n_ctx_orig,
|
|
6376
6367
|
float freq_base,
|
|
6377
6368
|
float freq_scale,
|
|
6378
6369
|
float ext_factor,
|
|
@@ -6380,8 +6371,8 @@ struct ggml_tensor * ggml_rope_custom(
|
|
|
6380
6371
|
float beta_fast,
|
|
6381
6372
|
float beta_slow) {
|
|
6382
6373
|
return ggml_rope_impl(
|
|
6383
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6384
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6374
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6375
|
+
ext_factor, attn_factor, beta_fast, beta_slow, false
|
|
6385
6376
|
);
|
|
6386
6377
|
}
|
|
6387
6378
|
|
|
@@ -6391,8 +6382,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
|
6391
6382
|
struct ggml_tensor * b,
|
|
6392
6383
|
int n_dims,
|
|
6393
6384
|
int mode,
|
|
6394
|
-
int n_ctx,
|
|
6395
|
-
int n_orig_ctx,
|
|
6385
|
+
int n_ctx_orig,
|
|
6396
6386
|
float freq_base,
|
|
6397
6387
|
float freq_scale,
|
|
6398
6388
|
float ext_factor,
|
|
@@ -6400,21 +6390,11 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|
|
6400
6390
|
float beta_fast,
|
|
6401
6391
|
float beta_slow) {
|
|
6402
6392
|
return ggml_rope_impl(
|
|
6403
|
-
ctx, a, b, NULL, n_dims, mode,
|
|
6404
|
-
ext_factor, attn_factor, beta_fast, beta_slow,
|
|
6393
|
+
ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
|
|
6394
|
+
ext_factor, attn_factor, beta_fast, beta_slow, true
|
|
6405
6395
|
);
|
|
6406
6396
|
}
|
|
6407
6397
|
|
|
6408
|
-
struct ggml_tensor * ggml_rope_xpos_inplace(
|
|
6409
|
-
struct ggml_context * ctx,
|
|
6410
|
-
struct ggml_tensor * a,
|
|
6411
|
-
struct ggml_tensor * b,
|
|
6412
|
-
int n_dims,
|
|
6413
|
-
float base,
|
|
6414
|
-
bool down) {
|
|
6415
|
-
return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
|
|
6416
|
-
}
|
|
6417
|
-
|
|
6418
6398
|
// ggml_rope_back
|
|
6419
6399
|
|
|
6420
6400
|
struct ggml_tensor * ggml_rope_back(
|
|
@@ -6424,16 +6404,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
|
6424
6404
|
struct ggml_tensor * c,
|
|
6425
6405
|
int n_dims,
|
|
6426
6406
|
int mode,
|
|
6427
|
-
int n_ctx,
|
|
6428
|
-
int n_orig_ctx,
|
|
6407
|
+
int n_ctx_orig,
|
|
6429
6408
|
float freq_base,
|
|
6430
6409
|
float freq_scale,
|
|
6431
6410
|
float ext_factor,
|
|
6432
6411
|
float attn_factor,
|
|
6433
6412
|
float beta_fast,
|
|
6434
|
-
float beta_slow,
|
|
6435
|
-
float xpos_base,
|
|
6436
|
-
bool xpos_down) {
|
|
6413
|
+
float beta_slow) {
|
|
6437
6414
|
GGML_ASSERT(ggml_is_vector(b));
|
|
6438
6415
|
GGML_ASSERT(b->type == GGML_TYPE_I32);
|
|
6439
6416
|
GGML_ASSERT(a->ne[2] == b->ne[0]);
|
|
@@ -6449,15 +6426,13 @@ struct ggml_tensor * ggml_rope_back(
|
|
|
6449
6426
|
|
|
6450
6427
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
|
6451
6428
|
|
|
6452
|
-
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
|
|
6429
|
+
int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
|
|
6453
6430
|
memcpy(params + 5, &freq_base, sizeof(float));
|
|
6454
6431
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
|
6455
6432
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
|
6456
6433
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
|
6457
6434
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
|
6458
6435
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
|
6459
|
-
memcpy(params + 11, &xpos_base, sizeof(float));
|
|
6460
|
-
memcpy(params + 12, &xpos_down, sizeof(bool));
|
|
6461
6436
|
ggml_set_op_params(result, params, sizeof(params));
|
|
6462
6437
|
|
|
6463
6438
|
result->op = GGML_OP_ROPE_BACK;
|
|
@@ -9043,17 +9018,6 @@ static void ggml_compute_forward_add_f32(
|
|
|
9043
9018
|
const int ith = params->ith;
|
|
9044
9019
|
const int nth = params->nth;
|
|
9045
9020
|
|
|
9046
|
-
#ifdef GGML_USE_CLBLAST
|
|
9047
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
|
9048
|
-
// TODO: OpenCL kernel support full broadcast
|
|
9049
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
|
9050
|
-
if (ith == 0) {
|
|
9051
|
-
ggml_cl_add(src0, src1, dst);
|
|
9052
|
-
}
|
|
9053
|
-
return;
|
|
9054
|
-
}
|
|
9055
|
-
#endif
|
|
9056
|
-
|
|
9057
9021
|
const int nr = ggml_nrows(src0);
|
|
9058
9022
|
|
|
9059
9023
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
@@ -10161,17 +10125,6 @@ static void ggml_compute_forward_mul_f32(
|
|
|
10161
10125
|
const int ith = params->ith;
|
|
10162
10126
|
const int nth = params->nth;
|
|
10163
10127
|
|
|
10164
|
-
#if defined(GGML_USE_CLBLAST)
|
|
10165
|
-
if (src1->backend == GGML_BACKEND_TYPE_GPU) {
|
|
10166
|
-
// TODO: OpenCL kernel support full broadcast
|
|
10167
|
-
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
|
10168
|
-
if (ith == 0) {
|
|
10169
|
-
ggml_cl_mul(src0, src1, dst);
|
|
10170
|
-
}
|
|
10171
|
-
return;
|
|
10172
|
-
}
|
|
10173
|
-
#endif
|
|
10174
|
-
|
|
10175
10128
|
const int64_t nr = ggml_nrows(src0);
|
|
10176
10129
|
|
|
10177
10130
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
@@ -12407,15 +12360,6 @@ static void ggml_compute_forward_mul_mat(
|
|
|
12407
12360
|
// nb01 >= nb00 - src0 is not transposed
|
|
12408
12361
|
// compute by src0 rows
|
|
12409
12362
|
|
|
12410
|
-
#if defined(GGML_USE_CLBLAST)
|
|
12411
|
-
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
|
12412
|
-
if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) {
|
|
12413
|
-
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
|
12414
|
-
}
|
|
12415
|
-
return;
|
|
12416
|
-
}
|
|
12417
|
-
#endif
|
|
12418
|
-
|
|
12419
12363
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12420
12364
|
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
|
12421
12365
|
const int64_t ne_plane = ne01*ne00;
|
|
@@ -12863,8 +12807,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
|
12863
12807
|
// nb01 >= nb00 - src0 is not transposed
|
|
12864
12808
|
// compute by src0 rows
|
|
12865
12809
|
|
|
12866
|
-
// TODO: #if defined(GGML_USE_CLBLAST)
|
|
12867
|
-
|
|
12868
12810
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12869
12811
|
bool use_blas = ggml_is_matrix(src0) &&
|
|
12870
12812
|
ggml_is_matrix(src1) &&
|
|
@@ -13062,7 +13004,7 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
|
13062
13004
|
// nb01 >= nb00 - src0 is not transposed
|
|
13063
13005
|
// compute by src0 rows
|
|
13064
13006
|
|
|
13065
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
13007
|
+
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
13066
13008
|
|
|
13067
13009
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
13068
13010
|
if (ith != 0) {
|
|
@@ -14259,8 +14201,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
|
|
|
14259
14201
|
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
|
|
14260
14202
|
static void rope_yarn(
|
|
14261
14203
|
float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
|
|
14262
|
-
float * cos_theta, float * sin_theta
|
|
14263
|
-
) {
|
|
14204
|
+
float * cos_theta, float * sin_theta) {
|
|
14264
14205
|
// Get n-d rotational scaling corrected for extrapolation
|
|
14265
14206
|
float theta_interp = freq_scale * theta_extrap;
|
|
14266
14207
|
float theta = theta_interp;
|
|
@@ -14277,18 +14218,19 @@ static void rope_yarn(
|
|
|
14277
14218
|
|
|
14278
14219
|
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
|
14279
14220
|
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
|
14280
|
-
static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
|
|
14281
|
-
return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
14221
|
+
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
|
14222
|
+
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
14282
14223
|
}
|
|
14283
14224
|
|
|
14284
14225
|
static void ggml_rope_cache_init(
|
|
14285
|
-
float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
|
14286
|
-
float * cache, float sin_sign, float theta_scale
|
|
14287
|
-
|
|
14226
|
+
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
|
14227
|
+
float * cache, float sin_sign, float theta_scale) {
|
|
14228
|
+
// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
|
|
14288
14229
|
float theta = theta_base;
|
|
14289
14230
|
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14231
|
+
const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
|
|
14290
14232
|
rope_yarn(
|
|
14291
|
-
theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
|
14233
|
+
theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
|
|
14292
14234
|
);
|
|
14293
14235
|
cache[i0 + 1] *= sin_sign;
|
|
14294
14236
|
|
|
@@ -14297,11 +14239,11 @@ static void ggml_rope_cache_init(
|
|
|
14297
14239
|
}
|
|
14298
14240
|
|
|
14299
14241
|
GGML_CALL void ggml_rope_yarn_corr_dims(
|
|
14300
|
-
int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
14242
|
+
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
14301
14243
|
) {
|
|
14302
14244
|
// start and end correction dims
|
|
14303
|
-
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base));
|
|
14304
|
-
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base));
|
|
14245
|
+
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
|
14246
|
+
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
|
14305
14247
|
dims[0] = MAX(0, start);
|
|
14306
14248
|
dims[1] = MIN(n_dims - 1, end);
|
|
14307
14249
|
}
|
|
@@ -14321,15 +14263,11 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14321
14263
|
|
|
14322
14264
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
14323
14265
|
|
|
14324
|
-
// these two only relevant for xPos RoPE:
|
|
14325
|
-
float xpos_base;
|
|
14326
|
-
bool xpos_down;
|
|
14327
|
-
|
|
14328
14266
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14329
14267
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
14330
14268
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
14331
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14332
|
-
const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
|
|
14269
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14270
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
14333
14271
|
|
|
14334
14272
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
14335
14273
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
@@ -14337,8 +14275,6 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14337
14275
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
|
14338
14276
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
14339
14277
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
14340
|
-
memcpy(&xpos_base, (int32_t *) dst->op_params + 11, sizeof(float));
|
|
14341
|
-
memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool));
|
|
14342
14278
|
|
|
14343
14279
|
GGML_TENSOR_UNARY_OP_LOCALS
|
|
14344
14280
|
|
|
@@ -14368,20 +14304,15 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14368
14304
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
14369
14305
|
|
|
14370
14306
|
float corr_dims[2];
|
|
14371
|
-
ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
|
|
14307
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
14372
14308
|
|
|
14373
14309
|
const bool is_neox = mode & 2;
|
|
14374
|
-
const bool is_glm = mode & 4;
|
|
14375
14310
|
|
|
14376
14311
|
const float * freq_factors = NULL;
|
|
14377
|
-
if (is_neox) {
|
|
14378
|
-
|
|
14379
|
-
|
|
14380
|
-
|
|
14381
|
-
freq_factors = (const float *) src2->data;
|
|
14382
|
-
}
|
|
14383
|
-
} else {
|
|
14384
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
|
14312
|
+
if (src2 != NULL) {
|
|
14313
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
|
14314
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
|
14315
|
+
freq_factors = (const float *) src2->data;
|
|
14385
14316
|
}
|
|
14386
14317
|
|
|
14387
14318
|
// backward process uses inverse rotation by cos and sin.
|
|
@@ -14396,94 +14327,50 @@ static void ggml_compute_forward_rope_f32(
|
|
|
14396
14327
|
const int64_t p = pos[i2];
|
|
14397
14328
|
|
|
14398
14329
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
14399
|
-
|
|
14400
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14401
|
-
}
|
|
14330
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14402
14331
|
|
|
14403
14332
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
14404
14333
|
if (ir++ < ir0) continue;
|
|
14405
14334
|
if (ir > ir1) break;
|
|
14406
14335
|
|
|
14407
|
-
|
|
14408
|
-
|
|
14409
|
-
if (is_glm) {
|
|
14410
|
-
theta_base = MIN(p, n_ctx - 2);
|
|
14411
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
|
14412
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
|
14413
|
-
const float cos_theta = cosf(theta_base);
|
|
14414
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
|
14415
|
-
const float cos_block_theta = cosf(block_theta);
|
|
14416
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
|
14417
|
-
|
|
14418
|
-
theta_base *= theta_scale;
|
|
14419
|
-
block_theta *= theta_scale;
|
|
14420
|
-
|
|
14421
|
-
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14422
|
-
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14423
|
-
|
|
14424
|
-
const float x0 = src[0];
|
|
14425
|
-
const float x1 = src[n_dims/2];
|
|
14426
|
-
const float x2 = src[n_dims];
|
|
14427
|
-
const float x3 = src[n_dims/2*3];
|
|
14428
|
-
|
|
14429
|
-
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14430
|
-
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
14431
|
-
dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta;
|
|
14432
|
-
dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
|
|
14433
|
-
}
|
|
14434
|
-
} else if (!is_neox) {
|
|
14435
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14336
|
+
if (!is_neox) {
|
|
14337
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14436
14338
|
const float cos_theta = cache[i0 + 0];
|
|
14437
14339
|
const float sin_theta = cache[i0 + 1];
|
|
14438
14340
|
|
|
14439
|
-
// zeta scaling for xPos only:
|
|
14440
|
-
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
|
14441
|
-
if (xpos_down) zeta = 1.0f / zeta;
|
|
14442
|
-
|
|
14443
14341
|
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14444
14342
|
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14445
14343
|
|
|
14446
14344
|
const float x0 = src[0];
|
|
14447
14345
|
const float x1 = src[1];
|
|
14448
14346
|
|
|
14449
|
-
dst_data[0] = x0*cos_theta
|
|
14450
|
-
dst_data[1] = x0*sin_theta
|
|
14347
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14348
|
+
dst_data[1] = x0*sin_theta + x1*cos_theta;
|
|
14451
14349
|
}
|
|
14452
14350
|
} else {
|
|
14453
|
-
|
|
14454
|
-
|
|
14455
|
-
if (ic < n_dims) {
|
|
14456
|
-
const int64_t i0 = ic/2;
|
|
14351
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14352
|
+
const int64_t ic = i0/2;
|
|
14457
14353
|
|
|
14458
|
-
|
|
14459
|
-
|
|
14460
|
-
float cos_theta, sin_theta;
|
|
14461
|
-
rope_yarn(
|
|
14462
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
|
14463
|
-
&cos_theta, &sin_theta
|
|
14464
|
-
);
|
|
14465
|
-
|
|
14466
|
-
sin_theta *= sin_sign;
|
|
14467
|
-
theta_base *= theta_scale;
|
|
14354
|
+
const float cos_theta = cache[i0 + 0];
|
|
14355
|
+
const float sin_theta = cache[i0 + 1];
|
|
14468
14356
|
|
|
14469
|
-
|
|
14470
|
-
|
|
14357
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
|
14358
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
|
14471
14359
|
|
|
14472
|
-
|
|
14473
|
-
|
|
14360
|
+
const float x0 = src[0];
|
|
14361
|
+
const float x1 = src[n_dims/2];
|
|
14474
14362
|
|
|
14475
|
-
|
|
14476
|
-
|
|
14477
|
-
|
|
14478
|
-
|
|
14363
|
+
dst_data[0] = x0*cos_theta - x1*sin_theta;
|
|
14364
|
+
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
|
|
14365
|
+
}
|
|
14366
|
+
}
|
|
14479
14367
|
|
|
14480
|
-
|
|
14481
|
-
|
|
14368
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
|
14369
|
+
const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14370
|
+
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14482
14371
|
|
|
14483
|
-
|
|
14484
|
-
|
|
14485
|
-
}
|
|
14486
|
-
}
|
|
14372
|
+
dst_data[0] = src[0];
|
|
14373
|
+
dst_data[1] = src[1];
|
|
14487
14374
|
}
|
|
14488
14375
|
}
|
|
14489
14376
|
}
|
|
@@ -14509,8 +14396,8 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14509
14396
|
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
14510
14397
|
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
14511
14398
|
const int mode = ((int32_t *) dst->op_params)[2];
|
|
14512
|
-
const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14513
|
-
const int
|
|
14399
|
+
//const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
14400
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
14514
14401
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
14515
14402
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
14516
14403
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
|
@@ -14546,20 +14433,15 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14546
14433
|
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
14547
14434
|
|
|
14548
14435
|
float corr_dims[2];
|
|
14549
|
-
ggml_rope_yarn_corr_dims(n_dims,
|
|
14436
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
14550
14437
|
|
|
14551
14438
|
const bool is_neox = mode & 2;
|
|
14552
|
-
const bool is_glm = mode & 4;
|
|
14553
14439
|
|
|
14554
14440
|
const float * freq_factors = NULL;
|
|
14555
|
-
if (
|
|
14556
|
-
|
|
14557
|
-
|
|
14558
|
-
|
|
14559
|
-
freq_factors = (const float *) src2->data;
|
|
14560
|
-
}
|
|
14561
|
-
} else {
|
|
14562
|
-
GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox");
|
|
14441
|
+
if (src2 != NULL) {
|
|
14442
|
+
GGML_ASSERT(src2->type == GGML_TYPE_F32);
|
|
14443
|
+
GGML_ASSERT(src2->ne[0] >= n_dims / 2);
|
|
14444
|
+
freq_factors = (const float *) src2->data;
|
|
14563
14445
|
}
|
|
14564
14446
|
|
|
14565
14447
|
// backward process uses inverse rotation by cos and sin.
|
|
@@ -14574,43 +14456,14 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14574
14456
|
const int64_t p = pos[i2];
|
|
14575
14457
|
|
|
14576
14458
|
float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
|
|
14577
|
-
|
|
14578
|
-
ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14579
|
-
}
|
|
14459
|
+
ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
|
|
14580
14460
|
|
|
14581
14461
|
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
14582
14462
|
if (ir++ < ir0) continue;
|
|
14583
14463
|
if (ir > ir1) break;
|
|
14584
14464
|
|
|
14585
|
-
|
|
14586
|
-
|
|
14587
|
-
if (is_glm) {
|
|
14588
|
-
theta_base = MIN(p, n_ctx - 2);
|
|
14589
|
-
float block_theta = MAX(p - (n_ctx - 2), 0);
|
|
14590
|
-
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
|
14591
|
-
const float cos_theta = cosf(theta_base);
|
|
14592
|
-
const float sin_theta = sinf(theta_base) * sin_sign;
|
|
14593
|
-
const float cos_block_theta = cosf(block_theta);
|
|
14594
|
-
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
|
14595
|
-
|
|
14596
|
-
theta_base *= theta_scale;
|
|
14597
|
-
block_theta *= theta_scale;
|
|
14598
|
-
|
|
14599
|
-
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14600
|
-
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14601
|
-
|
|
14602
|
-
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
|
14603
|
-
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
|
14604
|
-
const float x2 = GGML_FP16_TO_FP32(src[n_dims]);
|
|
14605
|
-
const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]);
|
|
14606
|
-
|
|
14607
|
-
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
|
14608
|
-
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14609
|
-
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
|
|
14610
|
-
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
|
|
14611
|
-
}
|
|
14612
|
-
} else if (!is_neox) {
|
|
14613
|
-
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
14465
|
+
if (!is_neox) {
|
|
14466
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14614
14467
|
const float cos_theta = cache[i0 + 0];
|
|
14615
14468
|
const float sin_theta = cache[i0 + 1];
|
|
14616
14469
|
|
|
@@ -14624,40 +14477,29 @@ static void ggml_compute_forward_rope_f16(
|
|
|
14624
14477
|
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14625
14478
|
}
|
|
14626
14479
|
} else {
|
|
14627
|
-
|
|
14628
|
-
|
|
14629
|
-
if (ic < n_dims) {
|
|
14630
|
-
const int64_t i0 = ic/2;
|
|
14631
|
-
|
|
14632
|
-
const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;
|
|
14633
|
-
|
|
14634
|
-
float cos_theta, sin_theta;
|
|
14635
|
-
rope_yarn(
|
|
14636
|
-
theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
|
|
14637
|
-
&cos_theta, &sin_theta
|
|
14638
|
-
);
|
|
14480
|
+
for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
|
|
14481
|
+
const int64_t ic = i0/2;
|
|
14639
14482
|
|
|
14640
|
-
|
|
14641
|
-
|
|
14483
|
+
const float cos_theta = cache[i0 + 0];
|
|
14484
|
+
const float sin_theta = cache[i0 + 1];
|
|
14642
14485
|
|
|
14643
|
-
|
|
14644
|
-
|
|
14486
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
|
|
14487
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
|
|
14645
14488
|
|
|
14646
|
-
|
|
14647
|
-
|
|
14489
|
+
const float x0 = GGML_FP16_TO_FP32(src[0]);
|
|
14490
|
+
const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
|
|
14648
14491
|
|
|
14649
|
-
|
|
14650
|
-
|
|
14651
|
-
|
|
14652
|
-
|
|
14492
|
+
dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
|
|
14493
|
+
dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
|
14494
|
+
}
|
|
14495
|
+
}
|
|
14653
14496
|
|
|
14654
|
-
|
|
14655
|
-
|
|
14497
|
+
for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
|
|
14498
|
+
const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
14499
|
+
ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
14656
14500
|
|
|
14657
|
-
|
|
14658
|
-
|
|
14659
|
-
}
|
|
14660
|
-
}
|
|
14501
|
+
dst_data[0] = src[0];
|
|
14502
|
+
dst_data[1] = src[1];
|
|
14661
14503
|
}
|
|
14662
14504
|
}
|
|
14663
14505
|
}
|
|
@@ -18359,9 +18201,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18359
18201
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
|
18360
18202
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
|
18361
18203
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
|
18362
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18363
|
-
const int
|
|
18364
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
18204
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18205
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
|
18206
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
18365
18207
|
|
|
18366
18208
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
|
18367
18209
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
|
@@ -18369,8 +18211,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18369
18211
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
|
18370
18212
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
|
18371
18213
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
|
18372
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
|
18373
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
|
18374
18214
|
|
|
18375
18215
|
src0->grad = ggml_add_or_set(ctx,
|
|
18376
18216
|
src0->grad,
|
|
@@ -18380,16 +18220,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18380
18220
|
src2,
|
|
18381
18221
|
n_dims,
|
|
18382
18222
|
mode,
|
|
18383
|
-
|
|
18384
|
-
n_orig_ctx,
|
|
18223
|
+
n_ctx_orig,
|
|
18385
18224
|
freq_base,
|
|
18386
18225
|
freq_scale,
|
|
18387
18226
|
ext_factor,
|
|
18388
18227
|
attn_factor,
|
|
18389
18228
|
beta_fast,
|
|
18390
|
-
beta_slow,
|
|
18391
|
-
xpos_base,
|
|
18392
|
-
xpos_down),
|
|
18229
|
+
beta_slow),
|
|
18393
18230
|
zero_table);
|
|
18394
18231
|
}
|
|
18395
18232
|
} break;
|
|
@@ -18399,9 +18236,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18399
18236
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
|
18400
18237
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
|
18401
18238
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
|
18402
|
-
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18403
|
-
const int
|
|
18404
|
-
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
18239
|
+
//const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
|
18240
|
+
const int n_ctx_orig = ((int32_t *) tensor->op_params)[4];
|
|
18241
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
18405
18242
|
|
|
18406
18243
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
|
18407
18244
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
|
@@ -18409,8 +18246,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18409
18246
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
|
18410
18247
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
|
18411
18248
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
|
18412
|
-
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
|
18413
|
-
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
|
18414
18249
|
|
|
18415
18250
|
src0->grad = ggml_add_or_set(ctx,
|
|
18416
18251
|
src0->grad,
|
|
@@ -18420,16 +18255,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
|
18420
18255
|
src2,
|
|
18421
18256
|
n_dims,
|
|
18422
18257
|
mode,
|
|
18423
|
-
|
|
18424
|
-
n_orig_ctx,
|
|
18258
|
+
n_ctx_orig,
|
|
18425
18259
|
freq_base,
|
|
18426
18260
|
freq_scale,
|
|
18427
18261
|
ext_factor,
|
|
18428
18262
|
attn_factor,
|
|
18429
18263
|
beta_fast,
|
|
18430
18264
|
beta_slow,
|
|
18431
|
-
xpos_base,
|
|
18432
|
-
xpos_down,
|
|
18433
18265
|
false),
|
|
18434
18266
|
zero_table);
|
|
18435
18267
|
}
|
|
@@ -19536,11 +19368,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
19536
19368
|
{
|
|
19537
19369
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
|
19538
19370
|
|
|
19539
|
-
#if defined(GGML_USE_CLBLAST)
|
|
19540
|
-
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
|
|
19541
|
-
cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node);
|
|
19542
|
-
} else
|
|
19543
|
-
#endif
|
|
19544
19371
|
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
19545
19372
|
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
|
19546
19373
|
if (node->src[0]->type != GGML_TYPE_F32) {
|
|
@@ -19670,6 +19497,59 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
19670
19497
|
return cplan;
|
|
19671
19498
|
}
|
|
19672
19499
|
|
|
19500
|
+
static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) {
|
|
19501
|
+
enum ggml_status compute_status = GGML_STATUS_SUCCESS;
|
|
19502
|
+
|
|
19503
|
+
#ifdef GGML_USE_OPENMP
|
|
19504
|
+
if (n_threads > 1) {
|
|
19505
|
+
#pragma omp parallel num_threads(n_threads)
|
|
19506
|
+
{
|
|
19507
|
+
#pragma omp single
|
|
19508
|
+
{
|
|
19509
|
+
// update the number of threads from the actual number of threads that we got from OpenMP
|
|
19510
|
+
n_threads = omp_get_num_threads();
|
|
19511
|
+
workers[0].shared->n_threads = n_threads;
|
|
19512
|
+
workers[0].shared->n_active = n_threads;
|
|
19513
|
+
}
|
|
19514
|
+
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
|
19515
|
+
}
|
|
19516
|
+
} else {
|
|
19517
|
+
ggml_graph_compute_thread(&workers[0]);
|
|
19518
|
+
}
|
|
19519
|
+
#else
|
|
19520
|
+
// create thread pool
|
|
19521
|
+
if (n_threads > 1) {
|
|
19522
|
+
for (int j = 1; j < n_threads; ++j) {
|
|
19523
|
+
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
19524
|
+
GGML_ASSERT(rc == 0);
|
|
19525
|
+
UNUSED(rc);
|
|
19526
|
+
}
|
|
19527
|
+
}
|
|
19528
|
+
|
|
19529
|
+
// this is a work thread too
|
|
19530
|
+
ggml_graph_compute_thread(&workers[0]);
|
|
19531
|
+
|
|
19532
|
+
// join or kill thread pool
|
|
19533
|
+
if (n_threads > 1) {
|
|
19534
|
+
for (int j = 1; j < n_threads; j++) {
|
|
19535
|
+
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
19536
|
+
GGML_ASSERT(rc == 0);
|
|
19537
|
+
UNUSED(rc);
|
|
19538
|
+
}
|
|
19539
|
+
}
|
|
19540
|
+
#endif
|
|
19541
|
+
// don't leave affinity set on the main thread
|
|
19542
|
+
clear_numa_thread_affinity();
|
|
19543
|
+
|
|
19544
|
+
for (int j = 0; j < n_threads; j++) {
|
|
19545
|
+
if (workers[j].ec != GGML_STATUS_SUCCESS) {
|
|
19546
|
+
compute_status = workers[j].ec;
|
|
19547
|
+
break;
|
|
19548
|
+
}
|
|
19549
|
+
}
|
|
19550
|
+
return compute_status;
|
|
19551
|
+
}
|
|
19552
|
+
|
|
19673
19553
|
enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|
19674
19554
|
{
|
|
19675
19555
|
GGML_ASSERT(cplan);
|
|
@@ -19680,7 +19560,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
|
19680
19560
|
}
|
|
19681
19561
|
}
|
|
19682
19562
|
|
|
19683
|
-
|
|
19563
|
+
int n_threads = cplan->n_threads;
|
|
19564
|
+
|
|
19565
|
+
#if defined(GGML_USE_OPENMP)
|
|
19566
|
+
n_threads = MIN(n_threads, omp_get_max_threads());
|
|
19567
|
+
#endif
|
|
19684
19568
|
|
|
19685
19569
|
struct ggml_compute_state_shared state_shared = {
|
|
19686
19570
|
/*.cgraph =*/ cgraph,
|
|
@@ -19696,47 +19580,20 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
|
19696
19580
|
/*.current_chunk; =*/ 0,
|
|
19697
19581
|
};
|
|
19698
19582
|
struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
|
|
19699
|
-
|
|
19700
|
-
// create thread pool
|
|
19701
|
-
if (n_threads > 1) {
|
|
19702
|
-
for (int j = 1; j < n_threads; ++j) {
|
|
19703
|
-
workers[j] = (struct ggml_compute_state) {
|
|
19704
|
-
.thrd = 0,
|
|
19705
|
-
.ith = j,
|
|
19706
|
-
.shared = &state_shared,
|
|
19707
|
-
.ec = GGML_STATUS_SUCCESS,
|
|
19708
|
-
};
|
|
19709
|
-
|
|
19710
|
-
const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
|
|
19711
|
-
GGML_ASSERT(rc == 0);
|
|
19712
|
-
UNUSED(rc);
|
|
19713
|
-
}
|
|
19714
|
-
}
|
|
19715
|
-
|
|
19716
|
-
workers[0].ith = 0;
|
|
19717
|
-
workers[0].shared = &state_shared;
|
|
19718
|
-
workers[0].ec = GGML_STATUS_SUCCESS;
|
|
19719
|
-
|
|
19720
19583
|
const int64_t perf_start_cycles = ggml_perf_cycles();
|
|
19721
19584
|
const int64_t perf_start_time_us = ggml_perf_time_us();
|
|
19722
19585
|
|
|
19723
|
-
|
|
19724
|
-
|
|
19725
|
-
|
|
19726
|
-
|
|
19727
|
-
|
|
19728
|
-
|
|
19729
|
-
|
|
19730
|
-
// join or kill thread pool
|
|
19731
|
-
if (n_threads > 1) {
|
|
19732
|
-
for (int j = 1; j < n_threads; j++) {
|
|
19733
|
-
const int rc = ggml_thread_join(workers[j].thrd, NULL);
|
|
19734
|
-
GGML_ASSERT(rc == 0);
|
|
19735
|
-
if (workers[j].ec != GGML_STATUS_SUCCESS)
|
|
19736
|
-
compute_status = workers[j].ec;
|
|
19737
|
-
}
|
|
19586
|
+
for (int j = 0; j < n_threads; ++j) {
|
|
19587
|
+
workers[j] = (struct ggml_compute_state) {
|
|
19588
|
+
.thrd = 0,
|
|
19589
|
+
.ith = j,
|
|
19590
|
+
.shared = &state_shared,
|
|
19591
|
+
.ec = GGML_STATUS_SUCCESS,
|
|
19592
|
+
};
|
|
19738
19593
|
}
|
|
19739
19594
|
|
|
19595
|
+
enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads);
|
|
19596
|
+
|
|
19740
19597
|
// performance stats (graph)
|
|
19741
19598
|
{
|
|
19742
19599
|
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
|
|
@@ -22819,7 +22676,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
|
22819
22676
|
}
|
|
22820
22677
|
|
|
22821
22678
|
int ggml_cpu_has_blas(void) {
|
|
22822
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(
|
|
22679
|
+
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
|
22823
22680
|
return 1;
|
|
22824
22681
|
#else
|
|
22825
22682
|
return 0;
|
|
@@ -22834,14 +22691,6 @@ int ggml_cpu_has_cuda(void) {
|
|
|
22834
22691
|
#endif
|
|
22835
22692
|
}
|
|
22836
22693
|
|
|
22837
|
-
int ggml_cpu_has_clblast(void) {
|
|
22838
|
-
#if defined(GGML_USE_CLBLAST)
|
|
22839
|
-
return 1;
|
|
22840
|
-
#else
|
|
22841
|
-
return 0;
|
|
22842
|
-
#endif
|
|
22843
|
-
}
|
|
22844
|
-
|
|
22845
22694
|
int ggml_cpu_has_vulkan(void) {
|
|
22846
22695
|
#if defined(GGML_USE_VULKAN)
|
|
22847
22696
|
return 1;
|
|
@@ -22875,8 +22724,7 @@ int ggml_cpu_has_rpc(void) {
|
|
|
22875
22724
|
}
|
|
22876
22725
|
|
|
22877
22726
|
int ggml_cpu_has_gpublas(void) {
|
|
22878
|
-
return ggml_cpu_has_cuda() ||
|
|
22879
|
-
ggml_cpu_has_sycl();
|
|
22727
|
+
return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl();
|
|
22880
22728
|
}
|
|
22881
22729
|
|
|
22882
22730
|
int ggml_cpu_has_sse3(void) {
|