llama_cpp 0.16.0 → 0.16.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
297
297
|
|
298
298
|
#if defined(GGML_USE_ACCELERATE)
|
299
299
|
#include <Accelerate/Accelerate.h>
|
300
|
-
#elif defined(GGML_USE_OPENBLAS)
|
301
|
-
#if defined(GGML_BLAS_USE_MKL)
|
302
|
-
#include <mkl.h>
|
303
|
-
#else
|
304
|
-
#include <cblas.h>
|
305
|
-
#endif
|
306
300
|
#endif
|
307
301
|
|
308
302
|
// floating point type used to accumulate sums
|
@@ -1759,9 +1753,8 @@ struct ggml_compute_state_shared {
|
|
1759
1753
|
int n_threads;
|
1760
1754
|
|
1761
1755
|
// synchronization primitives
|
1762
|
-
atomic_int
|
1763
|
-
atomic_int
|
1764
|
-
atomic_int node_task; // active graph node task phase
|
1756
|
+
atomic_int n_barrier;
|
1757
|
+
atomic_int n_barrier_passed;
|
1765
1758
|
|
1766
1759
|
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
1767
1760
|
void* abort_callback_data;
|
@@ -3212,35 +3205,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
3212
3205
|
return tensor->nb[0] > tensor->nb[1];
|
3213
3206
|
}
|
3214
3207
|
|
3215
|
-
|
3216
|
-
|
3208
|
+
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
3209
|
+
size_t next_nb = ggml_type_size(tensor->type);
|
3210
|
+
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
|
3211
|
+
return false;
|
3212
|
+
}
|
3213
|
+
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
|
3214
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
3215
|
+
if (tensor->ne[i] != 1) {
|
3216
|
+
if (i > n) {
|
3217
|
+
if (tensor->nb[i] != next_nb) {
|
3218
|
+
return false;
|
3219
|
+
}
|
3220
|
+
next_nb *= tensor->ne[i];
|
3221
|
+
} else {
|
3222
|
+
// this dimension does not need to be contiguous
|
3223
|
+
next_nb = tensor->ne[i]*tensor->nb[i];
|
3224
|
+
}
|
3225
|
+
}
|
3226
|
+
}
|
3227
|
+
return true;
|
3228
|
+
}
|
3217
3229
|
|
3218
|
-
|
3219
|
-
|
3220
|
-
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
3221
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
3222
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3230
|
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
3231
|
+
return ggml_is_contiguous_0(tensor);
|
3223
3232
|
}
|
3224
3233
|
|
3225
3234
|
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
3226
|
-
return
|
3235
|
+
return ggml_is_contiguous_n(tensor, 0);
|
3227
3236
|
}
|
3228
3237
|
|
3229
3238
|
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
3230
|
-
|
3231
|
-
|
3232
|
-
return
|
3233
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
3234
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
3235
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3239
|
+
return ggml_is_contiguous_n(tensor, 1);
|
3236
3240
|
}
|
3237
3241
|
|
3238
3242
|
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
3239
|
-
|
3240
|
-
|
3241
|
-
return
|
3242
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
3243
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
3243
|
+
return ggml_is_contiguous_n(tensor, 2);
|
3244
3244
|
}
|
3245
3245
|
|
3246
3246
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
@@ -3272,20 +3272,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
3272
3272
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3273
3273
|
|
3274
3274
|
return
|
3275
|
-
(t0->ne[0] == t1->ne[0]
|
3276
|
-
(t0->ne[1] == t1->ne[1]
|
3277
|
-
(t0->ne[2] == t1->ne[2]
|
3278
|
-
(t0->ne[3] == t1->ne[3]
|
3275
|
+
(t0->ne[0] == t1->ne[0]) &&
|
3276
|
+
(t0->ne[1] == t1->ne[1]) &&
|
3277
|
+
(t0->ne[2] == t1->ne[2]) &&
|
3278
|
+
(t0->ne[3] == t1->ne[3]);
|
3279
3279
|
}
|
3280
3280
|
|
3281
3281
|
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
3282
3282
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
3283
3283
|
|
3284
3284
|
return
|
3285
|
-
(t0->nb[0] == t1->nb[0]
|
3286
|
-
(t0->nb[1] == t1->nb[1]
|
3287
|
-
(t0->nb[2] == t1->nb[2]
|
3288
|
-
(t0->nb[3] == t1->nb[3]
|
3285
|
+
(t0->nb[0] == t1->nb[0]) &&
|
3286
|
+
(t0->nb[1] == t1->nb[1]) &&
|
3287
|
+
(t0->nb[2] == t1->nb[2]) &&
|
3288
|
+
(t0->nb[3] == t1->nb[3]);
|
3289
3289
|
}
|
3290
3290
|
|
3291
3291
|
// check if t1 can be represented as a repeatition of t0
|
@@ -4078,32 +4078,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
4078
4078
|
switch (tensor->type) {
|
4079
4079
|
case GGML_TYPE_I8:
|
4080
4080
|
{
|
4081
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
4082
4081
|
return ((int8_t *)(tensor->data))[i];
|
4083
4082
|
}
|
4084
4083
|
case GGML_TYPE_I16:
|
4085
4084
|
{
|
4086
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
4087
4085
|
return ((int16_t *)(tensor->data))[i];
|
4088
4086
|
}
|
4089
4087
|
case GGML_TYPE_I32:
|
4090
4088
|
{
|
4091
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
4092
4089
|
return ((int32_t *)(tensor->data))[i];
|
4093
4090
|
}
|
4094
4091
|
case GGML_TYPE_F16:
|
4095
4092
|
{
|
4096
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
4097
4093
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
4098
4094
|
}
|
4099
4095
|
case GGML_TYPE_BF16:
|
4100
4096
|
{
|
4101
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
4102
4097
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
|
4103
4098
|
}
|
4104
4099
|
case GGML_TYPE_F32:
|
4105
4100
|
{
|
4106
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
4107
4101
|
return ((float *)(tensor->data))[i];
|
4108
4102
|
}
|
4109
4103
|
default:
|
@@ -4125,32 +4119,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
4125
4119
|
switch (tensor->type) {
|
4126
4120
|
case GGML_TYPE_I8:
|
4127
4121
|
{
|
4128
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
4129
4122
|
((int8_t *)(tensor->data))[i] = value;
|
4130
4123
|
} break;
|
4131
4124
|
case GGML_TYPE_I16:
|
4132
4125
|
{
|
4133
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
4134
4126
|
((int16_t *)(tensor->data))[i] = value;
|
4135
4127
|
} break;
|
4136
4128
|
case GGML_TYPE_I32:
|
4137
4129
|
{
|
4138
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
4139
4130
|
((int32_t *)(tensor->data))[i] = value;
|
4140
4131
|
} break;
|
4141
4132
|
case GGML_TYPE_F16:
|
4142
4133
|
{
|
4143
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
4144
4134
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
4145
4135
|
} break;
|
4146
4136
|
case GGML_TYPE_BF16:
|
4147
4137
|
{
|
4148
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
4149
4138
|
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
|
4150
4139
|
} break;
|
4151
4140
|
case GGML_TYPE_F32:
|
4152
4141
|
{
|
4153
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
4154
4142
|
((float *)(tensor->data))[i] = value;
|
4155
4143
|
} break;
|
4156
4144
|
default:
|
@@ -7343,13 +7331,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
|
|
7343
7331
|
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
|
7344
7332
|
}
|
7345
7333
|
|
7346
|
-
//
|
7334
|
+
// ggml_unary
|
7347
7335
|
|
7348
7336
|
static struct ggml_tensor * ggml_unary_impl(
|
7349
7337
|
struct ggml_context * ctx,
|
7350
7338
|
struct ggml_tensor * a,
|
7351
7339
|
enum ggml_unary_op op,
|
7352
7340
|
bool inplace) {
|
7341
|
+
GGML_ASSERT(ggml_is_contiguous_1(a));
|
7342
|
+
|
7353
7343
|
bool is_node = false;
|
7354
7344
|
|
7355
7345
|
if (!inplace && (a->grad)) {
|
@@ -11014,6 +11004,8 @@ static void ggml_compute_forward_abs_f32(
|
|
11014
11004
|
const struct ggml_tensor * src0 = dst->src[0];
|
11015
11005
|
|
11016
11006
|
assert(params->ith == 0);
|
11007
|
+
assert(ggml_is_contiguous_1(src0));
|
11008
|
+
assert(ggml_is_contiguous_1(dst));
|
11017
11009
|
assert(ggml_are_same_shape(src0, dst));
|
11018
11010
|
|
11019
11011
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11023,9 +11015,6 @@ static void ggml_compute_forward_abs_f32(
|
|
11023
11015
|
const int n = ggml_nrows(src0);
|
11024
11016
|
const int nc = src0->ne[0];
|
11025
11017
|
|
11026
|
-
assert(dst->nb[0] == sizeof(float));
|
11027
|
-
assert(src0->nb[0] == sizeof(float));
|
11028
|
-
|
11029
11018
|
for (int i = 0; i < n; i++) {
|
11030
11019
|
ggml_vec_abs_f32(nc,
|
11031
11020
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11060,6 +11049,8 @@ static void ggml_compute_forward_sgn_f32(
|
|
11060
11049
|
const struct ggml_tensor * src0 = dst->src[0];
|
11061
11050
|
|
11062
11051
|
assert(params->ith == 0);
|
11052
|
+
assert(ggml_is_contiguous_1(src0));
|
11053
|
+
assert(ggml_is_contiguous_1(dst));
|
11063
11054
|
assert(ggml_are_same_shape(src0, dst));
|
11064
11055
|
|
11065
11056
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11069,9 +11060,6 @@ static void ggml_compute_forward_sgn_f32(
|
|
11069
11060
|
const int n = ggml_nrows(src0);
|
11070
11061
|
const int nc = src0->ne[0];
|
11071
11062
|
|
11072
|
-
assert(dst->nb[0] == sizeof(float));
|
11073
|
-
assert(src0->nb[0] == sizeof(float));
|
11074
|
-
|
11075
11063
|
for (int i = 0; i < n; i++) {
|
11076
11064
|
ggml_vec_sgn_f32(nc,
|
11077
11065
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11106,6 +11094,8 @@ static void ggml_compute_forward_neg_f32(
|
|
11106
11094
|
const struct ggml_tensor * src0 = dst->src[0];
|
11107
11095
|
|
11108
11096
|
assert(params->ith == 0);
|
11097
|
+
assert(ggml_is_contiguous_1(src0));
|
11098
|
+
assert(ggml_is_contiguous_1(dst));
|
11109
11099
|
assert(ggml_are_same_shape(src0, dst));
|
11110
11100
|
|
11111
11101
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11115,9 +11105,6 @@ static void ggml_compute_forward_neg_f32(
|
|
11115
11105
|
const int n = ggml_nrows(src0);
|
11116
11106
|
const int nc = src0->ne[0];
|
11117
11107
|
|
11118
|
-
assert(dst->nb[0] == sizeof(float));
|
11119
|
-
assert(src0->nb[0] == sizeof(float));
|
11120
|
-
|
11121
11108
|
for (int i = 0; i < n; i++) {
|
11122
11109
|
ggml_vec_neg_f32(nc,
|
11123
11110
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11152,6 +11139,8 @@ static void ggml_compute_forward_step_f32(
|
|
11152
11139
|
const struct ggml_tensor * src0 = dst->src[0];
|
11153
11140
|
|
11154
11141
|
assert(params->ith == 0);
|
11142
|
+
assert(ggml_is_contiguous_1(src0));
|
11143
|
+
assert(ggml_is_contiguous_1(dst));
|
11155
11144
|
assert(ggml_are_same_shape(src0, dst));
|
11156
11145
|
|
11157
11146
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11161,9 +11150,6 @@ static void ggml_compute_forward_step_f32(
|
|
11161
11150
|
const int n = ggml_nrows(src0);
|
11162
11151
|
const int nc = src0->ne[0];
|
11163
11152
|
|
11164
|
-
assert(dst->nb[0] == sizeof(float));
|
11165
|
-
assert(src0->nb[0] == sizeof(float));
|
11166
|
-
|
11167
11153
|
for (int i = 0; i < n; i++) {
|
11168
11154
|
ggml_vec_step_f32(nc,
|
11169
11155
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11198,6 +11184,8 @@ static void ggml_compute_forward_tanh_f32(
|
|
11198
11184
|
const struct ggml_tensor * src0 = dst->src[0];
|
11199
11185
|
|
11200
11186
|
assert(params->ith == 0);
|
11187
|
+
assert(ggml_is_contiguous_1(src0));
|
11188
|
+
assert(ggml_is_contiguous_1(dst));
|
11201
11189
|
assert(ggml_are_same_shape(src0, dst));
|
11202
11190
|
|
11203
11191
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11207,9 +11195,6 @@ static void ggml_compute_forward_tanh_f32(
|
|
11207
11195
|
const int n = ggml_nrows(src0);
|
11208
11196
|
const int nc = src0->ne[0];
|
11209
11197
|
|
11210
|
-
assert(dst->nb[0] == sizeof(float));
|
11211
|
-
assert(src0->nb[0] == sizeof(float));
|
11212
|
-
|
11213
11198
|
for (int i = 0; i < n; i++) {
|
11214
11199
|
ggml_vec_tanh_f32(nc,
|
11215
11200
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11244,6 +11229,8 @@ static void ggml_compute_forward_elu_f32(
|
|
11244
11229
|
const struct ggml_tensor * src0 = dst->src[0];
|
11245
11230
|
|
11246
11231
|
assert(params->ith == 0);
|
11232
|
+
assert(ggml_is_contiguous_1(src0));
|
11233
|
+
assert(ggml_is_contiguous_1(dst));
|
11247
11234
|
assert(ggml_are_same_shape(src0, dst));
|
11248
11235
|
|
11249
11236
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11253,9 +11240,6 @@ static void ggml_compute_forward_elu_f32(
|
|
11253
11240
|
const int n = ggml_nrows(src0);
|
11254
11241
|
const int nc = src0->ne[0];
|
11255
11242
|
|
11256
|
-
assert(dst->nb[0] == sizeof(float));
|
11257
|
-
assert(src0->nb[0] == sizeof(float));
|
11258
|
-
|
11259
11243
|
for (int i = 0; i < n; i++) {
|
11260
11244
|
ggml_vec_elu_f32(nc,
|
11261
11245
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11290,6 +11274,8 @@ static void ggml_compute_forward_relu_f32(
|
|
11290
11274
|
const struct ggml_tensor * src0 = dst->src[0];
|
11291
11275
|
|
11292
11276
|
assert(params->ith == 0);
|
11277
|
+
assert(ggml_is_contiguous_1(src0));
|
11278
|
+
assert(ggml_is_contiguous_1(dst));
|
11293
11279
|
assert(ggml_are_same_shape(src0, dst));
|
11294
11280
|
|
11295
11281
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11299,9 +11285,6 @@ static void ggml_compute_forward_relu_f32(
|
|
11299
11285
|
const int n = ggml_nrows(src0);
|
11300
11286
|
const int nc = src0->ne[0];
|
11301
11287
|
|
11302
|
-
assert(dst->nb[0] == sizeof(float));
|
11303
|
-
assert(src0->nb[0] == sizeof(float));
|
11304
|
-
|
11305
11288
|
for (int i = 0; i < n; i++) {
|
11306
11289
|
ggml_vec_relu_f32(nc,
|
11307
11290
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11336,6 +11319,8 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
11336
11319
|
const struct ggml_tensor * src0 = dst->src[0];
|
11337
11320
|
|
11338
11321
|
assert(params->ith == 0);
|
11322
|
+
assert(ggml_is_contiguous_1(src0));
|
11323
|
+
assert(ggml_is_contiguous_1(dst));
|
11339
11324
|
assert(ggml_are_same_shape(src0, dst));
|
11340
11325
|
|
11341
11326
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11345,9 +11330,6 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
11345
11330
|
const int n = ggml_nrows(src0);
|
11346
11331
|
const int nc = src0->ne[0];
|
11347
11332
|
|
11348
|
-
assert(dst->nb[0] == sizeof(float));
|
11349
|
-
assert(src0->nb[0] == sizeof(float));
|
11350
|
-
|
11351
11333
|
for (int i = 0; i < n; i++) {
|
11352
11334
|
ggml_vec_sigmoid_f32(nc,
|
11353
11335
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11381,9 +11363,9 @@ static void ggml_compute_forward_gelu_f32(
|
|
11381
11363
|
|
11382
11364
|
const struct ggml_tensor * src0 = dst->src[0];
|
11383
11365
|
|
11384
|
-
|
11385
|
-
|
11386
|
-
|
11366
|
+
assert(ggml_is_contiguous_1(src0));
|
11367
|
+
assert(ggml_is_contiguous_1(dst));
|
11368
|
+
assert(ggml_are_same_shape(src0, dst));
|
11387
11369
|
|
11388
11370
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11389
11371
|
return;
|
@@ -11444,9 +11426,9 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
11444
11426
|
|
11445
11427
|
const struct ggml_tensor * src0 = dst->src[0];
|
11446
11428
|
|
11447
|
-
|
11448
|
-
|
11449
|
-
|
11429
|
+
assert(ggml_is_contiguous_1(src0));
|
11430
|
+
assert(ggml_is_contiguous_1(dst));
|
11431
|
+
assert(ggml_are_same_shape(src0, dst));
|
11450
11432
|
|
11451
11433
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11452
11434
|
return;
|
@@ -11507,9 +11489,9 @@ static void ggml_compute_forward_silu_f32(
|
|
11507
11489
|
|
11508
11490
|
const struct ggml_tensor * src0 = dst->src[0];
|
11509
11491
|
|
11510
|
-
|
11511
|
-
|
11512
|
-
|
11492
|
+
assert(ggml_is_contiguous_1(src0));
|
11493
|
+
assert(ggml_is_contiguous_1(dst));
|
11494
|
+
assert(ggml_are_same_shape(src0, dst));
|
11513
11495
|
|
11514
11496
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11515
11497
|
return;
|
@@ -11570,6 +11552,8 @@ static void ggml_compute_forward_leaky_relu_f32(
|
|
11570
11552
|
const struct ggml_tensor * src0 = dst->src[0];
|
11571
11553
|
|
11572
11554
|
assert(params->ith == 0);
|
11555
|
+
assert(ggml_is_contiguous_1(src0));
|
11556
|
+
assert(ggml_is_contiguous_1(dst));
|
11573
11557
|
assert(ggml_are_same_shape(src0, dst));
|
11574
11558
|
|
11575
11559
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11619,11 +11603,11 @@ static void ggml_compute_forward_silu_back_f32(
|
|
11619
11603
|
const struct ggml_tensor * src0 = dst->src[0];
|
11620
11604
|
const struct ggml_tensor * grad = dst->src[1];
|
11621
11605
|
|
11622
|
-
|
11623
|
-
|
11624
|
-
|
11625
|
-
|
11626
|
-
|
11606
|
+
assert(ggml_is_contiguous_1(grad));
|
11607
|
+
assert(ggml_is_contiguous_1(src0));
|
11608
|
+
assert(ggml_is_contiguous_1(dst));
|
11609
|
+
assert(ggml_are_same_shape(src0, dst));
|
11610
|
+
assert(ggml_are_same_shape(src0, grad));
|
11627
11611
|
|
11628
11612
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
11629
11613
|
return;
|
@@ -11685,6 +11669,8 @@ static void ggml_compute_forward_hardswish_f32(
|
|
11685
11669
|
const struct ggml_tensor * src0 = dst->src[0];
|
11686
11670
|
|
11687
11671
|
assert(params->ith == 0);
|
11672
|
+
assert(ggml_is_contiguous_1(src0));
|
11673
|
+
assert(ggml_is_contiguous_1(dst));
|
11688
11674
|
assert(ggml_are_same_shape(src0, dst));
|
11689
11675
|
|
11690
11676
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11694,9 +11680,6 @@ static void ggml_compute_forward_hardswish_f32(
|
|
11694
11680
|
const int n = ggml_nrows(src0);
|
11695
11681
|
const int nc = src0->ne[0];
|
11696
11682
|
|
11697
|
-
assert(dst->nb[0] == sizeof(float));
|
11698
|
-
assert(src0->nb[0] == sizeof(float));
|
11699
|
-
|
11700
11683
|
for (int i = 0; i < n; i++) {
|
11701
11684
|
ggml_vec_hardswish_f32(nc,
|
11702
11685
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -11728,6 +11711,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
11728
11711
|
const struct ggml_tensor * src0 = dst->src[0];
|
11729
11712
|
|
11730
11713
|
assert(params->ith == 0);
|
11714
|
+
assert(ggml_is_contiguous_1(src0));
|
11715
|
+
assert(ggml_is_contiguous_1(dst));
|
11731
11716
|
assert(ggml_are_same_shape(src0, dst));
|
11732
11717
|
|
11733
11718
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -11737,9 +11722,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
11737
11722
|
const int n = ggml_nrows(src0);
|
11738
11723
|
const int nc = src0->ne[0];
|
11739
11724
|
|
11740
|
-
assert(dst->nb[0] == sizeof(float));
|
11741
|
-
assert(src0->nb[0] == sizeof(float));
|
11742
|
-
|
11743
11725
|
for (int i = 0; i < n; i++) {
|
11744
11726
|
ggml_vec_hardsigmoid_f32(nc,
|
11745
11727
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -12190,39 +12172,6 @@ static void ggml_compute_forward_group_norm(
|
|
12190
12172
|
|
12191
12173
|
// ggml_compute_forward_mul_mat
|
12192
12174
|
|
12193
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12194
|
-
// helper function to determine if it is better to use BLAS or not
|
12195
|
-
// for large matrices, BLAS is faster
|
12196
|
-
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
12197
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
12198
|
-
const struct ggml_tensor * src1 = dst->src[1];
|
12199
|
-
|
12200
|
-
//const int64_t ne00 = src0->ne[0];
|
12201
|
-
//const int64_t ne01 = src0->ne[1];
|
12202
|
-
|
12203
|
-
const int64_t ne10 = src1->ne[0];
|
12204
|
-
|
12205
|
-
const int64_t ne0 = dst->ne[0];
|
12206
|
-
const int64_t ne1 = dst->ne[1];
|
12207
|
-
|
12208
|
-
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
12209
|
-
// all the experts for each batch element and the processing would become incredibly slow
|
12210
|
-
// TODO: find the optimal values for these
|
12211
|
-
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
12212
|
-
ggml_is_contiguous(src0) &&
|
12213
|
-
ggml_is_contiguous(src1) &&
|
12214
|
-
//src0->type == GGML_TYPE_F32 &&
|
12215
|
-
src1->type == GGML_TYPE_F32 &&
|
12216
|
-
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
12217
|
-
|
12218
|
-
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
12219
|
-
return true;
|
12220
|
-
}
|
12221
|
-
|
12222
|
-
return false;
|
12223
|
-
}
|
12224
|
-
#endif
|
12225
|
-
|
12226
12175
|
static void ggml_compute_forward_mul_mat_one_chunk(
|
12227
12176
|
const struct ggml_compute_params * params,
|
12228
12177
|
struct ggml_tensor * dst,
|
@@ -12360,73 +12309,6 @@ static void ggml_compute_forward_mul_mat(
|
|
12360
12309
|
// nb01 >= nb00 - src0 is not transposed
|
12361
12310
|
// compute by src0 rows
|
12362
12311
|
|
12363
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12364
|
-
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
12365
|
-
const int64_t ne_plane = ne01*ne00;
|
12366
|
-
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
12367
|
-
UNUSED(desired_wsize);
|
12368
|
-
|
12369
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
12370
|
-
if (type != GGML_TYPE_F32) {
|
12371
|
-
assert(params->wsize >= desired_wsize);
|
12372
|
-
// parallelize by src0 rows
|
12373
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
12374
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
12375
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
12376
|
-
const int64_t i03 = i13/r3;
|
12377
|
-
const int64_t i02 = i12/r2;
|
12378
|
-
|
12379
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
12380
|
-
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
12381
|
-
ggml_to_float_t const to_float = type_traits[type].to_float;
|
12382
|
-
|
12383
|
-
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
12384
|
-
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
12385
|
-
}
|
12386
|
-
}
|
12387
|
-
}
|
12388
|
-
}
|
12389
|
-
return;
|
12390
|
-
}
|
12391
|
-
|
12392
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
12393
|
-
return;
|
12394
|
-
}
|
12395
|
-
|
12396
|
-
// perform sgemm, parallelization controlled by blas lib
|
12397
|
-
if (ith != 0) {
|
12398
|
-
return;
|
12399
|
-
}
|
12400
|
-
|
12401
|
-
//const int64_t tgemm0 = ggml_perf_time_us();
|
12402
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
12403
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
12404
|
-
const int64_t i03 = i13/r3;
|
12405
|
-
const int64_t i02 = i12/r2;
|
12406
|
-
|
12407
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
12408
|
-
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
12409
|
-
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
12410
|
-
|
12411
|
-
if (type != GGML_TYPE_F32) {
|
12412
|
-
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
12413
|
-
}
|
12414
|
-
|
12415
|
-
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
12416
|
-
ne1, ne01, ne10,
|
12417
|
-
1.0f, y, ne10,
|
12418
|
-
x, ne00,
|
12419
|
-
0.0f, d, ne01);
|
12420
|
-
}
|
12421
|
-
}
|
12422
|
-
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
12423
|
-
|
12424
|
-
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
12425
|
-
|
12426
|
-
return;
|
12427
|
-
}
|
12428
|
-
#endif
|
12429
|
-
|
12430
12312
|
#if GGML_USE_LLAMAFILE
|
12431
12313
|
const bool src1_cont = ggml_is_contiguous(src1);
|
12432
12314
|
|
@@ -12807,19 +12689,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|
12807
12689
|
// nb01 >= nb00 - src0 is not transposed
|
12808
12690
|
// compute by src0 rows
|
12809
12691
|
|
12810
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12811
|
-
bool use_blas = ggml_is_matrix(src0) &&
|
12812
|
-
ggml_is_matrix(src1) &&
|
12813
|
-
ggml_is_contiguous(src0) &&
|
12814
|
-
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
12815
|
-
#endif
|
12816
|
-
|
12817
12692
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
12818
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
12819
|
-
if (use_blas) {
|
12820
|
-
return;
|
12821
|
-
}
|
12822
|
-
#endif
|
12823
12693
|
if (ith != 0) {
|
12824
12694
|
return;
|
12825
12695
|
}
|
@@ -12831,50 +12701,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
12831
12701
|
return;
|
12832
12702
|
}
|
12833
12703
|
|
12834
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
12835
|
-
if (use_blas) {
|
12836
|
-
if (params->ith != 0) { // All threads other than the first do no work.
|
12837
|
-
return;
|
12838
|
-
}
|
12839
|
-
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
12840
|
-
// src0: (k,n)
|
12841
|
-
// src1: (k,m)
|
12842
|
-
// dst: (m,n)
|
12843
|
-
//
|
12844
|
-
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
12845
|
-
// Also expressed as (major,minor)
|
12846
|
-
// a: (m,k): so src1 transposed
|
12847
|
-
// b: (k,n): so src0
|
12848
|
-
// c: (m,n)
|
12849
|
-
//
|
12850
|
-
// However, if ggml_is_transposed(src1) is true, then
|
12851
|
-
// src1->data already contains a transposed version, so sgemm mustn't
|
12852
|
-
// transpose it further.
|
12853
|
-
|
12854
|
-
int n = src0->ne[0];
|
12855
|
-
int k = src0->ne[1];
|
12856
|
-
int m = src1->ne[0];
|
12857
|
-
|
12858
|
-
int transposeA, lda;
|
12859
|
-
|
12860
|
-
if (!ggml_is_transposed(src1)) {
|
12861
|
-
transposeA = CblasTrans;
|
12862
|
-
lda = m;
|
12863
|
-
} else {
|
12864
|
-
transposeA = CblasNoTrans;
|
12865
|
-
lda = k;
|
12866
|
-
}
|
12867
|
-
|
12868
|
-
float * a = (float *) ((char *) src1->data);
|
12869
|
-
float * b = (float *) ((char *) src0->data);
|
12870
|
-
float * c = (float *) ((char *) dst->data);
|
12871
|
-
|
12872
|
-
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
12873
|
-
|
12874
|
-
return;
|
12875
|
-
}
|
12876
|
-
#endif
|
12877
|
-
|
12878
12704
|
// dst[:,:,:,:] = 0
|
12879
12705
|
// for i2,i3:
|
12880
12706
|
// for i1:
|
@@ -13004,8 +12830,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
13004
12830
|
// nb01 >= nb00 - src0 is not transposed
|
13005
12831
|
// compute by src0 rows
|
13006
12832
|
|
13007
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
13008
|
-
|
13009
12833
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
13010
12834
|
if (ith != 0) {
|
13011
12835
|
return;
|
@@ -13402,6 +13226,8 @@ static void ggml_compute_forward_get_rows_q(
|
|
13402
13226
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13403
13227
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13404
13228
|
|
13229
|
+
assert(i01 >= 0 && i01 < ne01);
|
13230
|
+
|
13405
13231
|
dequantize_row_q(
|
13406
13232
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13407
13233
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
@@ -13445,6 +13271,8 @@ static void ggml_compute_forward_get_rows_f16(
|
|
13445
13271
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13446
13272
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13447
13273
|
|
13274
|
+
assert(i01 >= 0 && i01 < ne01);
|
13275
|
+
|
13448
13276
|
ggml_fp16_to_fp32_row(
|
13449
13277
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13450
13278
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
@@ -13488,7 +13316,9 @@ static void ggml_compute_forward_get_rows_bf16(
|
|
13488
13316
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13489
13317
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13490
13318
|
|
13491
|
-
|
13319
|
+
assert(i01 >= 0 && i01 < ne01);
|
13320
|
+
|
13321
|
+
ggml_bf16_to_fp32_row(
|
13492
13322
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
13493
13323
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
13494
13324
|
}
|
@@ -13531,6 +13361,8 @@ static void ggml_compute_forward_get_rows_f32(
|
|
13531
13361
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
13532
13362
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
13533
13363
|
|
13364
|
+
assert(i01 >= 0 && i01 < ne01);
|
13365
|
+
|
13534
13366
|
ggml_vec_cpy_f32(nc,
|
13535
13367
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
13536
13368
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
@@ -16686,7 +16518,10 @@ static void ggml_compute_forward_map_unary_f32(
|
|
16686
16518
|
|
16687
16519
|
const struct ggml_tensor * src0 = dst->src[0];
|
16688
16520
|
|
16689
|
-
|
16521
|
+
assert(params->ith == 0);
|
16522
|
+
assert(ggml_is_contiguous_1(src0));
|
16523
|
+
assert(ggml_is_contiguous_1(dst));
|
16524
|
+
assert(ggml_are_same_shape(src0, dst));
|
16690
16525
|
|
16691
16526
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
16692
16527
|
return;
|
@@ -16695,9 +16530,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|
16695
16530
|
const int n = ggml_nrows(src0);
|
16696
16531
|
const int nc = src0->ne[0];
|
16697
16532
|
|
16698
|
-
assert( dst->nb[0] == sizeof(float));
|
16699
|
-
assert(src0->nb[0] == sizeof(float));
|
16700
|
-
|
16701
16533
|
for (int i = 0; i < n; i++) {
|
16702
16534
|
fun(nc,
|
16703
16535
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -16735,6 +16567,9 @@ static void ggml_compute_forward_map_binary_f32(
|
|
16735
16567
|
const struct ggml_tensor * src1 = dst->src[1];
|
16736
16568
|
|
16737
16569
|
assert(params->ith == 0);
|
16570
|
+
assert(ggml_is_contiguous_1(src0));
|
16571
|
+
assert(ggml_is_contiguous_1(src1));
|
16572
|
+
assert(ggml_is_contiguous_1(dst));
|
16738
16573
|
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
16739
16574
|
|
16740
16575
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
@@ -16744,10 +16579,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|
16744
16579
|
const int n = ggml_nrows(src0);
|
16745
16580
|
const int nc = src0->ne[0];
|
16746
16581
|
|
16747
|
-
assert( dst->nb[0] == sizeof(float));
|
16748
|
-
assert(src0->nb[0] == sizeof(float));
|
16749
|
-
assert(src1->nb[0] == sizeof(float));
|
16750
|
-
|
16751
16582
|
for (int i = 0; i < n; i++) {
|
16752
16583
|
fun(nc,
|
16753
16584
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
@@ -18905,6 +18736,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
18905
18736
|
switch (node->op) {
|
18906
18737
|
case GGML_OP_CPY:
|
18907
18738
|
case GGML_OP_DUP:
|
18739
|
+
case GGML_OP_CONT:
|
18908
18740
|
case GGML_OP_ADD:
|
18909
18741
|
case GGML_OP_ADD1:
|
18910
18742
|
case GGML_OP_ACC:
|
@@ -18989,7 +18821,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
18989
18821
|
} break;
|
18990
18822
|
case GGML_OP_SCALE:
|
18991
18823
|
case GGML_OP_SET:
|
18992
|
-
case GGML_OP_CONT:
|
18993
18824
|
case GGML_OP_RESHAPE:
|
18994
18825
|
case GGML_OP_VIEW:
|
18995
18826
|
case GGML_OP_PERMUTE:
|
@@ -19140,41 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
19140
18971
|
return n_tasks;
|
19141
18972
|
}
|
19142
18973
|
|
19143
|
-
|
19144
|
-
|
19145
|
-
|
19146
|
-
|
19147
|
-
while (true) {
|
19148
|
-
if (do_yield) {
|
19149
|
-
sched_yield();
|
19150
|
-
}
|
19151
|
-
|
19152
|
-
* node_n = atomic_load(&state->shared->node_n);
|
19153
|
-
if (* node_n != last_node_n) break;
|
19154
|
-
#if defined(__SSE3__)
|
19155
|
-
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
19156
|
-
_mm_pause();
|
19157
|
-
#endif
|
18974
|
+
#ifdef GGML_USE_OPENMP
|
18975
|
+
static void ggml_barrier(struct ggml_compute_state * state) {
|
18976
|
+
if (state->shared->n_threads == 1) {
|
18977
|
+
return;
|
19158
18978
|
}
|
18979
|
+
|
18980
|
+
#pragma omp barrier
|
19159
18981
|
}
|
18982
|
+
#else
|
18983
|
+
static void ggml_barrier(struct ggml_compute_state * state) {
|
18984
|
+
if (state->shared->n_threads == 1) {
|
18985
|
+
return;
|
18986
|
+
}
|
19160
18987
|
|
19161
|
-
|
19162
|
-
|
19163
|
-
const int last_task_phase = * task_phase;
|
18988
|
+
atomic_int * n_barrier = &state->shared->n_barrier;
|
18989
|
+
atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
|
19164
18990
|
|
19165
|
-
|
19166
|
-
|
18991
|
+
int n_threads = state->shared->n_threads;
|
18992
|
+
int passed_old = atomic_load(n_barrier_passed);
|
18993
|
+
|
18994
|
+
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
|
18995
|
+
// last thread
|
18996
|
+
atomic_store(n_barrier, 0);
|
18997
|
+
atomic_fetch_add(n_barrier_passed, 1);
|
18998
|
+
} else {
|
18999
|
+
// wait for other threads
|
19000
|
+
//while (atomic_load(n_barrier_passed) == passed_old) {
|
19001
|
+
//}
|
19002
|
+
const int n_spin_before_sleep = 100000;
|
19003
|
+
while (true) {
|
19004
|
+
for (int i = 0; i < n_spin_before_sleep; i++) {
|
19005
|
+
if (atomic_load(n_barrier_passed) != passed_old) {
|
19006
|
+
return;
|
19007
|
+
}
|
19008
|
+
#if defined(__SSE3__)
|
19009
|
+
_mm_pause();
|
19010
|
+
#endif
|
19011
|
+
}
|
19167
19012
|
sched_yield();
|
19168
19013
|
}
|
19169
|
-
|
19170
|
-
* task_phase = atomic_load(&state->shared->node_task);
|
19171
|
-
if (* task_phase != last_task_phase) break;
|
19172
|
-
#if defined(__SSE3__)
|
19173
|
-
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
19174
|
-
_mm_pause();
|
19175
|
-
#endif
|
19176
19014
|
}
|
19177
19015
|
}
|
19016
|
+
#endif
|
19178
19017
|
|
19179
19018
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
19180
19019
|
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
@@ -19182,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
19182
19021
|
const struct ggml_cgraph * cgraph = state->shared->cgraph;
|
19183
19022
|
const struct ggml_cplan * cplan = state->shared->cplan;
|
19184
19023
|
|
19185
|
-
const int
|
19024
|
+
const int ith = state->ith;
|
19025
|
+
const int n_threads = state->shared->n_threads;
|
19186
19026
|
|
19187
|
-
set_numa_thread_affinity(
|
19027
|
+
set_numa_thread_affinity(ith);
|
19188
19028
|
|
19189
|
-
|
19190
|
-
|
19029
|
+
struct ggml_compute_params params = {
|
19030
|
+
/*.type =*/ GGML_TASK_TYPE_INIT,
|
19031
|
+
/*.ith =*/ ith,
|
19032
|
+
/*.nth =*/ state->shared->n_threads,
|
19033
|
+
/*.wsize =*/ cplan->work_size,
|
19034
|
+
/*.wdata =*/ cplan->work_data,
|
19035
|
+
};
|
19191
19036
|
|
19192
|
-
|
19037
|
+
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
19193
19038
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
19194
|
-
state->shared->node_n += 1;
|
19195
19039
|
state->ec = GGML_STATUS_ABORTED;
|
19196
19040
|
return 0;
|
19197
19041
|
}
|
19198
19042
|
|
19199
|
-
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
19200
|
-
// all other threads are finished and spinning
|
19201
|
-
// do finalize and init here so we don't have synchronize again
|
19202
|
-
struct ggml_compute_params params = {
|
19203
|
-
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
|
19204
|
-
/*.ith =*/ 0,
|
19205
|
-
/*.nth =*/ 0,
|
19206
|
-
/*.wsize =*/ cplan->work_size,
|
19207
|
-
/*.wdata =*/ cplan->work_data,
|
19208
|
-
};
|
19209
|
-
|
19210
|
-
if (node_n != -1) {
|
19211
|
-
/* FINALIZE */
|
19212
|
-
struct ggml_tensor * node = cgraph->nodes[node_n];
|
19213
|
-
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
19214
|
-
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
19215
|
-
ggml_compute_forward(&params, node, state);
|
19216
|
-
}
|
19217
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
19218
|
-
}
|
19219
|
-
|
19220
|
-
// distribute new work or execute it direct if 1T
|
19221
|
-
while (++node_n < cgraph->n_nodes) {
|
19222
|
-
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
19223
|
-
struct ggml_tensor * node = cgraph->nodes[node_n];
|
19224
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
19225
|
-
|
19226
|
-
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
19227
|
-
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
19228
|
-
|
19229
|
-
params.nth = n_tasks;
|
19230
|
-
|
19231
|
-
if (n_tasks == 1) {
|
19232
|
-
/* INIT */
|
19233
|
-
if (GGML_OP_HAS_INIT[node->op]) {
|
19234
|
-
params.type = GGML_TASK_TYPE_INIT;
|
19235
|
-
ggml_compute_forward(&params, node, state);
|
19236
|
-
}
|
19237
|
-
|
19238
|
-
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
19239
|
-
// they do something more efficient than spinning (?)
|
19240
|
-
params.type = GGML_TASK_TYPE_COMPUTE;
|
19241
|
-
ggml_compute_forward(&params, node, state);
|
19242
|
-
|
19243
|
-
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
19244
|
-
params.type = GGML_TASK_TYPE_FINALIZE;
|
19245
|
-
ggml_compute_forward(&params, node, state);
|
19246
|
-
}
|
19247
|
-
|
19248
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
19249
|
-
} else {
|
19250
|
-
break;
|
19251
|
-
}
|
19252
|
-
|
19253
|
-
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
19254
|
-
break;
|
19255
|
-
}
|
19256
|
-
}
|
19257
|
-
|
19258
|
-
task_phase = GGML_TASK_TYPE_INIT;
|
19259
|
-
atomic_store(&state->shared->n_active, n_threads);
|
19260
|
-
atomic_store(&state->shared->node_n, node_n);
|
19261
|
-
atomic_store(&state->shared->node_task, task_phase);
|
19262
|
-
} else {
|
19263
|
-
ggml_graph_compute_thread_sync_node(&node_n, state, false);
|
19264
|
-
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
19265
|
-
}
|
19266
|
-
|
19267
|
-
// check if we should stop
|
19268
|
-
if (node_n >= cgraph->n_nodes) break;
|
19269
|
-
|
19270
|
-
/* INIT & COMPUTE */
|
19271
19043
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
19272
19044
|
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
19273
19045
|
|
19274
|
-
|
19275
|
-
/*.type =*/ GGML_TASK_TYPE_INIT,
|
19276
|
-
/*.ith =*/ state->ith,
|
19277
|
-
/*.nth =*/ n_tasks,
|
19278
|
-
/*.wsize =*/ cplan->work_size,
|
19279
|
-
/*.wdata =*/ cplan->work_data,
|
19280
|
-
};
|
19046
|
+
params.nth = n_tasks;
|
19281
19047
|
|
19282
|
-
|
19283
|
-
|
19048
|
+
/* INIT */
|
19049
|
+
if (GGML_OP_HAS_INIT[node->op]) {
|
19050
|
+
if (ith < n_tasks) {
|
19051
|
+
params.type = GGML_TASK_TYPE_INIT;
|
19284
19052
|
ggml_compute_forward(&params, node, state);
|
19285
19053
|
}
|
19054
|
+
ggml_barrier(state);
|
19286
19055
|
}
|
19287
19056
|
|
19288
|
-
|
19289
|
-
|
19290
|
-
atomic_store(&state->shared->n_active, n_threads);
|
19291
|
-
atomic_store(&state->shared->node_task, task_phase);
|
19292
|
-
}
|
19293
|
-
else {
|
19294
|
-
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
19295
|
-
// depending on the workload and the operating system.
|
19296
|
-
// since it is not clear what is the best approach, it should potentially become user-configurable
|
19297
|
-
// ref: https://github.com/ggerganov/ggml/issues/291
|
19298
|
-
// UPD: adding the do_yield flag seems to resolve the issue universally
|
19299
|
-
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
|
19300
|
-
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
|
19301
|
-
}
|
19302
|
-
|
19303
|
-
if (state->ith < n_tasks) {
|
19057
|
+
/* COMPUTE */
|
19058
|
+
if (ith < n_tasks) {
|
19304
19059
|
params.type = GGML_TASK_TYPE_COMPUTE;
|
19305
19060
|
ggml_compute_forward(&params, node, state);
|
19306
19061
|
}
|
19307
19062
|
|
19308
|
-
|
19309
|
-
|
19310
|
-
|
19311
|
-
|
19312
|
-
|
19313
|
-
|
19314
|
-
|
19063
|
+
ggml_barrier(state);
|
19064
|
+
|
19065
|
+
/* FINALIZE */
|
19066
|
+
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
19067
|
+
if (params.ith == 0) {
|
19068
|
+
params.type = GGML_TASK_TYPE_FINALIZE;
|
19069
|
+
ggml_compute_forward(&params, node, state);
|
19070
|
+
}
|
19071
|
+
ggml_barrier(state);
|
19315
19072
|
}
|
19316
19073
|
}
|
19317
19074
|
|
@@ -19368,17 +19125,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
19368
19125
|
{
|
19369
19126
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
19370
19127
|
|
19371
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
19372
|
-
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
19373
|
-
if (node->src[0]->type != GGML_TYPE_F32) {
|
19374
|
-
// here we need memory for fully dequantized matrix from src0
|
19375
|
-
// take into account that src0 can be broadcasted into src1[2,3]
|
19376
|
-
cur = ggml_type_size(GGML_TYPE_F32)
|
19377
|
-
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
19378
|
-
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
19379
|
-
}
|
19380
|
-
} else
|
19381
|
-
#endif
|
19382
19128
|
if (node->src[1]->type != vec_dot_type) {
|
19383
19129
|
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
19384
19130
|
}
|
@@ -19509,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
|
|
19509
19255
|
// update the number of threads from the actual number of threads that we got from OpenMP
|
19510
19256
|
n_threads = omp_get_num_threads();
|
19511
19257
|
workers[0].shared->n_threads = n_threads;
|
19512
|
-
workers[0].shared->n_active = n_threads;
|
19513
19258
|
}
|
19514
19259
|
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
19515
19260
|
}
|
@@ -19572,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
19572
19317
|
/*.perf_node_start_cycles =*/ 0,
|
19573
19318
|
/*.perf_node_start_time_us =*/ 0,
|
19574
19319
|
/*.n_threads =*/ n_threads,
|
19575
|
-
/*.
|
19576
|
-
/*.
|
19577
|
-
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
|
19320
|
+
/*.n_barrier =*/ 0,
|
19321
|
+
/*.n_barrier_passed =*/ 0,
|
19578
19322
|
/*.abort_callback =*/ NULL,
|
19579
19323
|
/*.abort_callback_data =*/ NULL,
|
19580
19324
|
/*.current_chunk; =*/ 0,
|
@@ -22676,7 +22420,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
22676
22420
|
}
|
22677
22421
|
|
22678
22422
|
int ggml_cpu_has_blas(void) {
|
22679
|
-
#if defined(
|
22423
|
+
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
22680
22424
|
return 1;
|
22681
22425
|
#else
|
22682
22426
|
return 0;
|