llama_cpp 0.16.0 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/ext/llama_cpp/extconf.rb +3 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +119 -54
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +190 -65
- data/vendor/tmp/llama.cpp/ggml-backend.h +6 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +77 -62
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +48 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +17 -9
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +21 -15
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2133 -13215
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +28826 -25037
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +438 -493
- data/vendor/tmp/llama.cpp/ggml.c +158 -414
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +628 -279
- data/vendor/tmp/llama.cpp/llama.h +9 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
|
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
|
|
|
297
297
|
|
|
298
298
|
#if defined(GGML_USE_ACCELERATE)
|
|
299
299
|
#include <Accelerate/Accelerate.h>
|
|
300
|
-
#elif defined(GGML_USE_OPENBLAS)
|
|
301
|
-
#if defined(GGML_BLAS_USE_MKL)
|
|
302
|
-
#include <mkl.h>
|
|
303
|
-
#else
|
|
304
|
-
#include <cblas.h>
|
|
305
|
-
#endif
|
|
306
300
|
#endif
|
|
307
301
|
|
|
308
302
|
// floating point type used to accumulate sums
|
|
@@ -1759,9 +1753,8 @@ struct ggml_compute_state_shared {
|
|
|
1759
1753
|
int n_threads;
|
|
1760
1754
|
|
|
1761
1755
|
// synchronization primitives
|
|
1762
|
-
atomic_int
|
|
1763
|
-
atomic_int
|
|
1764
|
-
atomic_int node_task; // active graph node task phase
|
|
1756
|
+
atomic_int n_barrier;
|
|
1757
|
+
atomic_int n_barrier_passed;
|
|
1765
1758
|
|
|
1766
1759
|
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
|
1767
1760
|
void* abort_callback_data;
|
|
@@ -3212,35 +3205,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
|
|
|
3212
3205
|
return tensor->nb[0] > tensor->nb[1];
|
|
3213
3206
|
}
|
|
3214
3207
|
|
|
3215
|
-
|
|
3216
|
-
|
|
3208
|
+
static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
|
|
3209
|
+
size_t next_nb = ggml_type_size(tensor->type);
|
|
3210
|
+
if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
|
|
3211
|
+
return false;
|
|
3212
|
+
}
|
|
3213
|
+
next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
|
|
3214
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
3215
|
+
if (tensor->ne[i] != 1) {
|
|
3216
|
+
if (i > n) {
|
|
3217
|
+
if (tensor->nb[i] != next_nb) {
|
|
3218
|
+
return false;
|
|
3219
|
+
}
|
|
3220
|
+
next_nb *= tensor->ne[i];
|
|
3221
|
+
} else {
|
|
3222
|
+
// this dimension does not need to be contiguous
|
|
3223
|
+
next_nb = tensor->ne[i]*tensor->nb[i];
|
|
3224
|
+
}
|
|
3225
|
+
}
|
|
3226
|
+
}
|
|
3227
|
+
return true;
|
|
3228
|
+
}
|
|
3217
3229
|
|
|
3218
|
-
|
|
3219
|
-
|
|
3220
|
-
tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
|
|
3221
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
|
3222
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3230
|
+
GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
|
|
3231
|
+
return ggml_is_contiguous_0(tensor);
|
|
3223
3232
|
}
|
|
3224
3233
|
|
|
3225
3234
|
GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
|
|
3226
|
-
return
|
|
3235
|
+
return ggml_is_contiguous_n(tensor, 0);
|
|
3227
3236
|
}
|
|
3228
3237
|
|
|
3229
3238
|
GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
|
|
3230
|
-
|
|
3231
|
-
|
|
3232
|
-
return
|
|
3233
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
|
3234
|
-
tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
|
|
3235
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3239
|
+
return ggml_is_contiguous_n(tensor, 1);
|
|
3236
3240
|
}
|
|
3237
3241
|
|
|
3238
3242
|
GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
return
|
|
3242
|
-
tensor->nb[0] == ggml_type_size(tensor->type) &&
|
|
3243
|
-
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
|
3243
|
+
return ggml_is_contiguous_n(tensor, 2);
|
|
3244
3244
|
}
|
|
3245
3245
|
|
|
3246
3246
|
GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
|
|
@@ -3272,20 +3272,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
|
|
|
3272
3272
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
3273
3273
|
|
|
3274
3274
|
return
|
|
3275
|
-
(t0->ne[0] == t1->ne[0]
|
|
3276
|
-
(t0->ne[1] == t1->ne[1]
|
|
3277
|
-
(t0->ne[2] == t1->ne[2]
|
|
3278
|
-
(t0->ne[3] == t1->ne[3]
|
|
3275
|
+
(t0->ne[0] == t1->ne[0]) &&
|
|
3276
|
+
(t0->ne[1] == t1->ne[1]) &&
|
|
3277
|
+
(t0->ne[2] == t1->ne[2]) &&
|
|
3278
|
+
(t0->ne[3] == t1->ne[3]);
|
|
3279
3279
|
}
|
|
3280
3280
|
|
|
3281
3281
|
bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
|
3282
3282
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
|
3283
3283
|
|
|
3284
3284
|
return
|
|
3285
|
-
(t0->nb[0] == t1->nb[0]
|
|
3286
|
-
(t0->nb[1] == t1->nb[1]
|
|
3287
|
-
(t0->nb[2] == t1->nb[2]
|
|
3288
|
-
(t0->nb[3] == t1->nb[3]
|
|
3285
|
+
(t0->nb[0] == t1->nb[0]) &&
|
|
3286
|
+
(t0->nb[1] == t1->nb[1]) &&
|
|
3287
|
+
(t0->nb[2] == t1->nb[2]) &&
|
|
3288
|
+
(t0->nb[3] == t1->nb[3]);
|
|
3289
3289
|
}
|
|
3290
3290
|
|
|
3291
3291
|
// check if t1 can be represented as a repetition of t0
|
|
@@ -4078,32 +4078,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
|
|
|
4078
4078
|
switch (tensor->type) {
|
|
4079
4079
|
case GGML_TYPE_I8:
|
|
4080
4080
|
{
|
|
4081
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
|
4082
4081
|
return ((int8_t *)(tensor->data))[i];
|
|
4083
4082
|
}
|
|
4084
4083
|
case GGML_TYPE_I16:
|
|
4085
4084
|
{
|
|
4086
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
|
4087
4085
|
return ((int16_t *)(tensor->data))[i];
|
|
4088
4086
|
}
|
|
4089
4087
|
case GGML_TYPE_I32:
|
|
4090
4088
|
{
|
|
4091
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
|
4092
4089
|
return ((int32_t *)(tensor->data))[i];
|
|
4093
4090
|
}
|
|
4094
4091
|
case GGML_TYPE_F16:
|
|
4095
4092
|
{
|
|
4096
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
4097
4093
|
return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
|
|
4098
4094
|
}
|
|
4099
4095
|
case GGML_TYPE_BF16:
|
|
4100
4096
|
{
|
|
4101
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
|
4102
4097
|
return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
|
|
4103
4098
|
}
|
|
4104
4099
|
case GGML_TYPE_F32:
|
|
4105
4100
|
{
|
|
4106
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
|
4107
4101
|
return ((float *)(tensor->data))[i];
|
|
4108
4102
|
}
|
|
4109
4103
|
default:
|
|
@@ -4125,32 +4119,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
|
|
|
4125
4119
|
switch (tensor->type) {
|
|
4126
4120
|
case GGML_TYPE_I8:
|
|
4127
4121
|
{
|
|
4128
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
|
|
4129
4122
|
((int8_t *)(tensor->data))[i] = value;
|
|
4130
4123
|
} break;
|
|
4131
4124
|
case GGML_TYPE_I16:
|
|
4132
4125
|
{
|
|
4133
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
|
|
4134
4126
|
((int16_t *)(tensor->data))[i] = value;
|
|
4135
4127
|
} break;
|
|
4136
4128
|
case GGML_TYPE_I32:
|
|
4137
4129
|
{
|
|
4138
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
|
|
4139
4130
|
((int32_t *)(tensor->data))[i] = value;
|
|
4140
4131
|
} break;
|
|
4141
4132
|
case GGML_TYPE_F16:
|
|
4142
4133
|
{
|
|
4143
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
|
|
4144
4134
|
((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
|
|
4145
4135
|
} break;
|
|
4146
4136
|
case GGML_TYPE_BF16:
|
|
4147
4137
|
{
|
|
4148
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
|
|
4149
4138
|
((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
|
|
4150
4139
|
} break;
|
|
4151
4140
|
case GGML_TYPE_F32:
|
|
4152
4141
|
{
|
|
4153
|
-
GGML_ASSERT(tensor->nb[0] == sizeof(float));
|
|
4154
4142
|
((float *)(tensor->data))[i] = value;
|
|
4155
4143
|
} break;
|
|
4156
4144
|
default:
|
|
@@ -7343,13 +7331,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
|
|
|
7343
7331
|
return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
|
|
7344
7332
|
}
|
|
7345
7333
|
|
|
7346
|
-
//
|
|
7334
|
+
// ggml_unary
|
|
7347
7335
|
|
|
7348
7336
|
static struct ggml_tensor * ggml_unary_impl(
|
|
7349
7337
|
struct ggml_context * ctx,
|
|
7350
7338
|
struct ggml_tensor * a,
|
|
7351
7339
|
enum ggml_unary_op op,
|
|
7352
7340
|
bool inplace) {
|
|
7341
|
+
GGML_ASSERT(ggml_is_contiguous_1(a));
|
|
7342
|
+
|
|
7353
7343
|
bool is_node = false;
|
|
7354
7344
|
|
|
7355
7345
|
if (!inplace && (a->grad)) {
|
|
@@ -11014,6 +11004,8 @@ static void ggml_compute_forward_abs_f32(
|
|
|
11014
11004
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11015
11005
|
|
|
11016
11006
|
assert(params->ith == 0);
|
|
11007
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11008
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11017
11009
|
assert(ggml_are_same_shape(src0, dst));
|
|
11018
11010
|
|
|
11019
11011
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11023,9 +11015,6 @@ static void ggml_compute_forward_abs_f32(
|
|
|
11023
11015
|
const int n = ggml_nrows(src0);
|
|
11024
11016
|
const int nc = src0->ne[0];
|
|
11025
11017
|
|
|
11026
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11027
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11028
|
-
|
|
11029
11018
|
for (int i = 0; i < n; i++) {
|
|
11030
11019
|
ggml_vec_abs_f32(nc,
|
|
11031
11020
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11060,6 +11049,8 @@ static void ggml_compute_forward_sgn_f32(
|
|
|
11060
11049
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11061
11050
|
|
|
11062
11051
|
assert(params->ith == 0);
|
|
11052
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11053
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11063
11054
|
assert(ggml_are_same_shape(src0, dst));
|
|
11064
11055
|
|
|
11065
11056
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11069,9 +11060,6 @@ static void ggml_compute_forward_sgn_f32(
|
|
|
11069
11060
|
const int n = ggml_nrows(src0);
|
|
11070
11061
|
const int nc = src0->ne[0];
|
|
11071
11062
|
|
|
11072
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11073
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11074
|
-
|
|
11075
11063
|
for (int i = 0; i < n; i++) {
|
|
11076
11064
|
ggml_vec_sgn_f32(nc,
|
|
11077
11065
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11106,6 +11094,8 @@ static void ggml_compute_forward_neg_f32(
|
|
|
11106
11094
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11107
11095
|
|
|
11108
11096
|
assert(params->ith == 0);
|
|
11097
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11098
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11109
11099
|
assert(ggml_are_same_shape(src0, dst));
|
|
11110
11100
|
|
|
11111
11101
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11115,9 +11105,6 @@ static void ggml_compute_forward_neg_f32(
|
|
|
11115
11105
|
const int n = ggml_nrows(src0);
|
|
11116
11106
|
const int nc = src0->ne[0];
|
|
11117
11107
|
|
|
11118
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11119
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11120
|
-
|
|
11121
11108
|
for (int i = 0; i < n; i++) {
|
|
11122
11109
|
ggml_vec_neg_f32(nc,
|
|
11123
11110
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11152,6 +11139,8 @@ static void ggml_compute_forward_step_f32(
|
|
|
11152
11139
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11153
11140
|
|
|
11154
11141
|
assert(params->ith == 0);
|
|
11142
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11143
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11155
11144
|
assert(ggml_are_same_shape(src0, dst));
|
|
11156
11145
|
|
|
11157
11146
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11161,9 +11150,6 @@ static void ggml_compute_forward_step_f32(
|
|
|
11161
11150
|
const int n = ggml_nrows(src0);
|
|
11162
11151
|
const int nc = src0->ne[0];
|
|
11163
11152
|
|
|
11164
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11165
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11166
|
-
|
|
11167
11153
|
for (int i = 0; i < n; i++) {
|
|
11168
11154
|
ggml_vec_step_f32(nc,
|
|
11169
11155
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11198,6 +11184,8 @@ static void ggml_compute_forward_tanh_f32(
|
|
|
11198
11184
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11199
11185
|
|
|
11200
11186
|
assert(params->ith == 0);
|
|
11187
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11188
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11201
11189
|
assert(ggml_are_same_shape(src0, dst));
|
|
11202
11190
|
|
|
11203
11191
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11207,9 +11195,6 @@ static void ggml_compute_forward_tanh_f32(
|
|
|
11207
11195
|
const int n = ggml_nrows(src0);
|
|
11208
11196
|
const int nc = src0->ne[0];
|
|
11209
11197
|
|
|
11210
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11211
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11212
|
-
|
|
11213
11198
|
for (int i = 0; i < n; i++) {
|
|
11214
11199
|
ggml_vec_tanh_f32(nc,
|
|
11215
11200
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11244,6 +11229,8 @@ static void ggml_compute_forward_elu_f32(
|
|
|
11244
11229
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11245
11230
|
|
|
11246
11231
|
assert(params->ith == 0);
|
|
11232
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11233
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11247
11234
|
assert(ggml_are_same_shape(src0, dst));
|
|
11248
11235
|
|
|
11249
11236
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11253,9 +11240,6 @@ static void ggml_compute_forward_elu_f32(
|
|
|
11253
11240
|
const int n = ggml_nrows(src0);
|
|
11254
11241
|
const int nc = src0->ne[0];
|
|
11255
11242
|
|
|
11256
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11257
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11258
|
-
|
|
11259
11243
|
for (int i = 0; i < n; i++) {
|
|
11260
11244
|
ggml_vec_elu_f32(nc,
|
|
11261
11245
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11290,6 +11274,8 @@ static void ggml_compute_forward_relu_f32(
|
|
|
11290
11274
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11291
11275
|
|
|
11292
11276
|
assert(params->ith == 0);
|
|
11277
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11278
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11293
11279
|
assert(ggml_are_same_shape(src0, dst));
|
|
11294
11280
|
|
|
11295
11281
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11299,9 +11285,6 @@ static void ggml_compute_forward_relu_f32(
|
|
|
11299
11285
|
const int n = ggml_nrows(src0);
|
|
11300
11286
|
const int nc = src0->ne[0];
|
|
11301
11287
|
|
|
11302
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11303
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11304
|
-
|
|
11305
11288
|
for (int i = 0; i < n; i++) {
|
|
11306
11289
|
ggml_vec_relu_f32(nc,
|
|
11307
11290
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11336,6 +11319,8 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
|
11336
11319
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11337
11320
|
|
|
11338
11321
|
assert(params->ith == 0);
|
|
11322
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11323
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11339
11324
|
assert(ggml_are_same_shape(src0, dst));
|
|
11340
11325
|
|
|
11341
11326
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11345,9 +11330,6 @@ static void ggml_compute_forward_sigmoid_f32(
|
|
|
11345
11330
|
const int n = ggml_nrows(src0);
|
|
11346
11331
|
const int nc = src0->ne[0];
|
|
11347
11332
|
|
|
11348
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11349
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11350
|
-
|
|
11351
11333
|
for (int i = 0; i < n; i++) {
|
|
11352
11334
|
ggml_vec_sigmoid_f32(nc,
|
|
11353
11335
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11381,9 +11363,9 @@ static void ggml_compute_forward_gelu_f32(
|
|
|
11381
11363
|
|
|
11382
11364
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11383
11365
|
|
|
11384
|
-
|
|
11385
|
-
|
|
11386
|
-
|
|
11366
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11367
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11368
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11387
11369
|
|
|
11388
11370
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11389
11371
|
return;
|
|
@@ -11444,9 +11426,9 @@ static void ggml_compute_forward_gelu_quick_f32(
|
|
|
11444
11426
|
|
|
11445
11427
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11446
11428
|
|
|
11447
|
-
|
|
11448
|
-
|
|
11449
|
-
|
|
11429
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11430
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11431
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11450
11432
|
|
|
11451
11433
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11452
11434
|
return;
|
|
@@ -11507,9 +11489,9 @@ static void ggml_compute_forward_silu_f32(
|
|
|
11507
11489
|
|
|
11508
11490
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11509
11491
|
|
|
11510
|
-
|
|
11511
|
-
|
|
11512
|
-
|
|
11492
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11493
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11494
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11513
11495
|
|
|
11514
11496
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11515
11497
|
return;
|
|
@@ -11570,6 +11552,8 @@ static void ggml_compute_forward_leaky_relu_f32(
|
|
|
11570
11552
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11571
11553
|
|
|
11572
11554
|
assert(params->ith == 0);
|
|
11555
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11556
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11573
11557
|
assert(ggml_are_same_shape(src0, dst));
|
|
11574
11558
|
|
|
11575
11559
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11619,11 +11603,11 @@ static void ggml_compute_forward_silu_back_f32(
|
|
|
11619
11603
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11620
11604
|
const struct ggml_tensor * grad = dst->src[1];
|
|
11621
11605
|
|
|
11622
|
-
|
|
11623
|
-
|
|
11624
|
-
|
|
11625
|
-
|
|
11626
|
-
|
|
11606
|
+
assert(ggml_is_contiguous_1(grad));
|
|
11607
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11608
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11609
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
11610
|
+
assert(ggml_are_same_shape(src0, grad));
|
|
11627
11611
|
|
|
11628
11612
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
11629
11613
|
return;
|
|
@@ -11685,6 +11669,8 @@ static void ggml_compute_forward_hardswish_f32(
|
|
|
11685
11669
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11686
11670
|
|
|
11687
11671
|
assert(params->ith == 0);
|
|
11672
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11673
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11688
11674
|
assert(ggml_are_same_shape(src0, dst));
|
|
11689
11675
|
|
|
11690
11676
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11694,9 +11680,6 @@ static void ggml_compute_forward_hardswish_f32(
|
|
|
11694
11680
|
const int n = ggml_nrows(src0);
|
|
11695
11681
|
const int nc = src0->ne[0];
|
|
11696
11682
|
|
|
11697
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11698
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11699
|
-
|
|
11700
11683
|
for (int i = 0; i < n; i++) {
|
|
11701
11684
|
ggml_vec_hardswish_f32(nc,
|
|
11702
11685
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -11728,6 +11711,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
|
11728
11711
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
11729
11712
|
|
|
11730
11713
|
assert(params->ith == 0);
|
|
11714
|
+
assert(ggml_is_contiguous_1(src0));
|
|
11715
|
+
assert(ggml_is_contiguous_1(dst));
|
|
11731
11716
|
assert(ggml_are_same_shape(src0, dst));
|
|
11732
11717
|
|
|
11733
11718
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -11737,9 +11722,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
|
|
|
11737
11722
|
const int n = ggml_nrows(src0);
|
|
11738
11723
|
const int nc = src0->ne[0];
|
|
11739
11724
|
|
|
11740
|
-
assert(dst->nb[0] == sizeof(float));
|
|
11741
|
-
assert(src0->nb[0] == sizeof(float));
|
|
11742
|
-
|
|
11743
11725
|
for (int i = 0; i < n; i++) {
|
|
11744
11726
|
ggml_vec_hardsigmoid_f32(nc,
|
|
11745
11727
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -12190,39 +12172,6 @@ static void ggml_compute_forward_group_norm(
|
|
|
12190
12172
|
|
|
12191
12173
|
// ggml_compute_forward_mul_mat
|
|
12192
12174
|
|
|
12193
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12194
|
-
// helper function to determine if it is better to use BLAS or not
|
|
12195
|
-
// for large matrices, BLAS is faster
|
|
12196
|
-
static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
|
|
12197
|
-
const struct ggml_tensor * src0 = dst->src[0];
|
|
12198
|
-
const struct ggml_tensor * src1 = dst->src[1];
|
|
12199
|
-
|
|
12200
|
-
//const int64_t ne00 = src0->ne[0];
|
|
12201
|
-
//const int64_t ne01 = src0->ne[1];
|
|
12202
|
-
|
|
12203
|
-
const int64_t ne10 = src1->ne[0];
|
|
12204
|
-
|
|
12205
|
-
const int64_t ne0 = dst->ne[0];
|
|
12206
|
-
const int64_t ne1 = dst->ne[1];
|
|
12207
|
-
|
|
12208
|
-
// NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
|
|
12209
|
-
// all the experts for each batch element and the processing would become incredibly slow
|
|
12210
|
-
// TODO: find the optimal values for these
|
|
12211
|
-
if (dst->op != GGML_OP_MUL_MAT_ID &&
|
|
12212
|
-
ggml_is_contiguous(src0) &&
|
|
12213
|
-
ggml_is_contiguous(src1) &&
|
|
12214
|
-
//src0->type == GGML_TYPE_F32 &&
|
|
12215
|
-
src1->type == GGML_TYPE_F32 &&
|
|
12216
|
-
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
|
|
12217
|
-
|
|
12218
|
-
/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
|
|
12219
|
-
return true;
|
|
12220
|
-
}
|
|
12221
|
-
|
|
12222
|
-
return false;
|
|
12223
|
-
}
|
|
12224
|
-
#endif
|
|
12225
|
-
|
|
12226
12175
|
static void ggml_compute_forward_mul_mat_one_chunk(
|
|
12227
12176
|
const struct ggml_compute_params * params,
|
|
12228
12177
|
struct ggml_tensor * dst,
|
|
@@ -12360,73 +12309,6 @@ static void ggml_compute_forward_mul_mat(
|
|
|
12360
12309
|
// nb01 >= nb00 - src0 is not transposed
|
|
12361
12310
|
// compute by src0 rows
|
|
12362
12311
|
|
|
12363
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12364
|
-
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
|
12365
|
-
const int64_t ne_plane = ne01*ne00;
|
|
12366
|
-
const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
|
12367
|
-
UNUSED(desired_wsize);
|
|
12368
|
-
|
|
12369
|
-
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
12370
|
-
if (type != GGML_TYPE_F32) {
|
|
12371
|
-
assert(params->wsize >= desired_wsize);
|
|
12372
|
-
// parallelize by src0 rows
|
|
12373
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
12374
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
12375
|
-
// broadcast src0 into src1 across 2nd,3rd dimension
|
|
12376
|
-
const int64_t i03 = i13/r3;
|
|
12377
|
-
const int64_t i02 = i12/r2;
|
|
12378
|
-
|
|
12379
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
12380
|
-
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
12381
|
-
ggml_to_float_t const to_float = type_traits[type].to_float;
|
|
12382
|
-
|
|
12383
|
-
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
|
12384
|
-
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
|
12385
|
-
}
|
|
12386
|
-
}
|
|
12387
|
-
}
|
|
12388
|
-
}
|
|
12389
|
-
return;
|
|
12390
|
-
}
|
|
12391
|
-
|
|
12392
|
-
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
12393
|
-
return;
|
|
12394
|
-
}
|
|
12395
|
-
|
|
12396
|
-
// perform sgemm, parallelization controlled by blas lib
|
|
12397
|
-
if (ith != 0) {
|
|
12398
|
-
return;
|
|
12399
|
-
}
|
|
12400
|
-
|
|
12401
|
-
//const int64_t tgemm0 = ggml_perf_time_us();
|
|
12402
|
-
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
|
12403
|
-
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
|
12404
|
-
const int64_t i03 = i13/r3;
|
|
12405
|
-
const int64_t i02 = i12/r2;
|
|
12406
|
-
|
|
12407
|
-
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
|
12408
|
-
const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
|
|
12409
|
-
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
|
12410
|
-
|
|
12411
|
-
if (type != GGML_TYPE_F32) {
|
|
12412
|
-
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
|
12413
|
-
}
|
|
12414
|
-
|
|
12415
|
-
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
|
12416
|
-
ne1, ne01, ne10,
|
|
12417
|
-
1.0f, y, ne10,
|
|
12418
|
-
x, ne00,
|
|
12419
|
-
0.0f, d, ne01);
|
|
12420
|
-
}
|
|
12421
|
-
}
|
|
12422
|
-
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
|
12423
|
-
|
|
12424
|
-
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
|
12425
|
-
|
|
12426
|
-
return;
|
|
12427
|
-
}
|
|
12428
|
-
#endif
|
|
12429
|
-
|
|
12430
12312
|
#if GGML_USE_LLAMAFILE
|
|
12431
12313
|
const bool src1_cont = ggml_is_contiguous(src1);
|
|
12432
12314
|
|
|
@@ -12807,19 +12689,7 @@ static void ggml_compute_forward_out_prod_f32(
|
|
|
12807
12689
|
// nb01 >= nb00 - src0 is not transposed
|
|
12808
12690
|
// compute by src0 rows
|
|
12809
12691
|
|
|
12810
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12811
|
-
bool use_blas = ggml_is_matrix(src0) &&
|
|
12812
|
-
ggml_is_matrix(src1) &&
|
|
12813
|
-
ggml_is_contiguous(src0) &&
|
|
12814
|
-
(ggml_is_contiguous(src1) || ggml_is_transposed(src1));
|
|
12815
|
-
#endif
|
|
12816
|
-
|
|
12817
12692
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
12818
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
|
|
12819
|
-
if (use_blas) {
|
|
12820
|
-
return;
|
|
12821
|
-
}
|
|
12822
|
-
#endif
|
|
12823
12693
|
if (ith != 0) {
|
|
12824
12694
|
return;
|
|
12825
12695
|
}
|
|
@@ -12831,50 +12701,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|
|
12831
12701
|
return;
|
|
12832
12702
|
}
|
|
12833
12703
|
|
|
12834
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
12835
|
-
if (use_blas) {
|
|
12836
|
-
if (params->ith != 0) { // All threads other than the first do no work.
|
|
12837
|
-
return;
|
|
12838
|
-
}
|
|
12839
|
-
// Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
|
|
12840
|
-
// src0: (k,n)
|
|
12841
|
-
// src1: (k,m)
|
|
12842
|
-
// dst: (m,n)
|
|
12843
|
-
//
|
|
12844
|
-
// Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
|
|
12845
|
-
// Also expressed as (major,minor)
|
|
12846
|
-
// a: (m,k): so src1 transposed
|
|
12847
|
-
// b: (k,n): so src0
|
|
12848
|
-
// c: (m,n)
|
|
12849
|
-
//
|
|
12850
|
-
// However, if ggml_is_transposed(src1) is true, then
|
|
12851
|
-
// src1->data already contains a transposed version, so sgemm mustn't
|
|
12852
|
-
// transpose it further.
|
|
12853
|
-
|
|
12854
|
-
int n = src0->ne[0];
|
|
12855
|
-
int k = src0->ne[1];
|
|
12856
|
-
int m = src1->ne[0];
|
|
12857
|
-
|
|
12858
|
-
int transposeA, lda;
|
|
12859
|
-
|
|
12860
|
-
if (!ggml_is_transposed(src1)) {
|
|
12861
|
-
transposeA = CblasTrans;
|
|
12862
|
-
lda = m;
|
|
12863
|
-
} else {
|
|
12864
|
-
transposeA = CblasNoTrans;
|
|
12865
|
-
lda = k;
|
|
12866
|
-
}
|
|
12867
|
-
|
|
12868
|
-
float * a = (float *) ((char *) src1->data);
|
|
12869
|
-
float * b = (float *) ((char *) src0->data);
|
|
12870
|
-
float * c = (float *) ((char *) dst->data);
|
|
12871
|
-
|
|
12872
|
-
cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
|
|
12873
|
-
|
|
12874
|
-
return;
|
|
12875
|
-
}
|
|
12876
|
-
#endif
|
|
12877
|
-
|
|
12878
12704
|
// dst[:,:,:,:] = 0
|
|
12879
12705
|
// for i2,i3:
|
|
12880
12706
|
// for i1:
|
|
@@ -13004,8 +12830,6 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|
|
13004
12830
|
// nb01 >= nb00 - src0 is not transposed
|
|
13005
12831
|
// compute by src0 rows
|
|
13006
12832
|
|
|
13007
|
-
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
13008
|
-
|
|
13009
12833
|
if (params->type == GGML_TASK_TYPE_INIT) {
|
|
13010
12834
|
if (ith != 0) {
|
|
13011
12835
|
return;
|
|
@@ -13402,6 +13226,8 @@ static void ggml_compute_forward_get_rows_q(
|
|
|
13402
13226
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13403
13227
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13404
13228
|
|
|
13229
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13230
|
+
|
|
13405
13231
|
dequantize_row_q(
|
|
13406
13232
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13407
13233
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
@@ -13445,6 +13271,8 @@ static void ggml_compute_forward_get_rows_f16(
|
|
|
13445
13271
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13446
13272
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13447
13273
|
|
|
13274
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13275
|
+
|
|
13448
13276
|
ggml_fp16_to_fp32_row(
|
|
13449
13277
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13450
13278
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
@@ -13488,7 +13316,9 @@ static void ggml_compute_forward_get_rows_bf16(
|
|
|
13488
13316
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13489
13317
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13490
13318
|
|
|
13491
|
-
|
|
13319
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13320
|
+
|
|
13321
|
+
ggml_bf16_to_fp32_row(
|
|
13492
13322
|
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
|
|
13493
13323
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
|
|
13494
13324
|
}
|
|
@@ -13531,6 +13361,8 @@ static void ggml_compute_forward_get_rows_f32(
|
|
|
13531
13361
|
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
|
|
13532
13362
|
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
|
|
13533
13363
|
|
|
13364
|
+
assert(i01 >= 0 && i01 < ne01);
|
|
13365
|
+
|
|
13534
13366
|
ggml_vec_cpy_f32(nc,
|
|
13535
13367
|
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
|
|
13536
13368
|
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
|
|
@@ -16686,7 +16518,10 @@ static void ggml_compute_forward_map_unary_f32(
|
|
|
16686
16518
|
|
|
16687
16519
|
const struct ggml_tensor * src0 = dst->src[0];
|
|
16688
16520
|
|
|
16689
|
-
|
|
16521
|
+
assert(params->ith == 0);
|
|
16522
|
+
assert(ggml_is_contiguous_1(src0));
|
|
16523
|
+
assert(ggml_is_contiguous_1(dst));
|
|
16524
|
+
assert(ggml_are_same_shape(src0, dst));
|
|
16690
16525
|
|
|
16691
16526
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
16692
16527
|
return;
|
|
@@ -16695,9 +16530,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|
|
16695
16530
|
const int n = ggml_nrows(src0);
|
|
16696
16531
|
const int nc = src0->ne[0];
|
|
16697
16532
|
|
|
16698
|
-
assert( dst->nb[0] == sizeof(float));
|
|
16699
|
-
assert(src0->nb[0] == sizeof(float));
|
|
16700
|
-
|
|
16701
16533
|
for (int i = 0; i < n; i++) {
|
|
16702
16534
|
fun(nc,
|
|
16703
16535
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -16735,6 +16567,9 @@ static void ggml_compute_forward_map_binary_f32(
|
|
|
16735
16567
|
const struct ggml_tensor * src1 = dst->src[1];
|
|
16736
16568
|
|
|
16737
16569
|
assert(params->ith == 0);
|
|
16570
|
+
assert(ggml_is_contiguous_1(src0));
|
|
16571
|
+
assert(ggml_is_contiguous_1(src1));
|
|
16572
|
+
assert(ggml_is_contiguous_1(dst));
|
|
16738
16573
|
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
|
|
16739
16574
|
|
|
16740
16575
|
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
|
|
@@ -16744,10 +16579,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|
|
16744
16579
|
const int n = ggml_nrows(src0);
|
|
16745
16580
|
const int nc = src0->ne[0];
|
|
16746
16581
|
|
|
16747
|
-
assert( dst->nb[0] == sizeof(float));
|
|
16748
|
-
assert(src0->nb[0] == sizeof(float));
|
|
16749
|
-
assert(src1->nb[0] == sizeof(float));
|
|
16750
|
-
|
|
16751
16582
|
for (int i = 0; i < n; i++) {
|
|
16752
16583
|
fun(nc,
|
|
16753
16584
|
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
|
@@ -18905,6 +18736,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
|
18905
18736
|
switch (node->op) {
|
|
18906
18737
|
case GGML_OP_CPY:
|
|
18907
18738
|
case GGML_OP_DUP:
|
|
18739
|
+
case GGML_OP_CONT:
|
|
18908
18740
|
case GGML_OP_ADD:
|
|
18909
18741
|
case GGML_OP_ADD1:
|
|
18910
18742
|
case GGML_OP_ACC:
|
|
@@ -18989,7 +18821,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
|
18989
18821
|
} break;
|
|
18990
18822
|
case GGML_OP_SCALE:
|
|
18991
18823
|
case GGML_OP_SET:
|
|
18992
|
-
case GGML_OP_CONT:
|
|
18993
18824
|
case GGML_OP_RESHAPE:
|
|
18994
18825
|
case GGML_OP_VIEW:
|
|
18995
18826
|
case GGML_OP_PERMUTE:
|
|
@@ -19140,41 +18971,49 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
|
|
19140
18971
|
return n_tasks;
|
|
19141
18972
|
}
|
|
19142
18973
|
|
|
19143
|
-
|
|
19144
|
-
|
|
19145
|
-
|
|
19146
|
-
|
|
19147
|
-
while (true) {
|
|
19148
|
-
if (do_yield) {
|
|
19149
|
-
sched_yield();
|
|
19150
|
-
}
|
|
19151
|
-
|
|
19152
|
-
* node_n = atomic_load(&state->shared->node_n);
|
|
19153
|
-
if (* node_n != last_node_n) break;
|
|
19154
|
-
#if defined(__SSE3__)
|
|
19155
|
-
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
|
19156
|
-
_mm_pause();
|
|
19157
|
-
#endif
|
|
18974
|
+
#ifdef GGML_USE_OPENMP
|
|
18975
|
+
static void ggml_barrier(struct ggml_compute_state * state) {
|
|
18976
|
+
if (state->shared->n_threads == 1) {
|
|
18977
|
+
return;
|
|
19158
18978
|
}
|
|
18979
|
+
|
|
18980
|
+
#pragma omp barrier
|
|
19159
18981
|
}
|
|
18982
|
+
#else
|
|
18983
|
+
static void ggml_barrier(struct ggml_compute_state * state) {
|
|
18984
|
+
if (state->shared->n_threads == 1) {
|
|
18985
|
+
return;
|
|
18986
|
+
}
|
|
19160
18987
|
|
|
19161
|
-
|
|
19162
|
-
|
|
19163
|
-
const int last_task_phase = * task_phase;
|
|
18988
|
+
atomic_int * n_barrier = &state->shared->n_barrier;
|
|
18989
|
+
atomic_int * n_barrier_passed = &state->shared->n_barrier_passed;
|
|
19164
18990
|
|
|
19165
|
-
|
|
19166
|
-
|
|
18991
|
+
int n_threads = state->shared->n_threads;
|
|
18992
|
+
int passed_old = atomic_load(n_barrier_passed);
|
|
18993
|
+
|
|
18994
|
+
if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
|
|
18995
|
+
// last thread
|
|
18996
|
+
atomic_store(n_barrier, 0);
|
|
18997
|
+
atomic_fetch_add(n_barrier_passed, 1);
|
|
18998
|
+
} else {
|
|
18999
|
+
// wait for other threads
|
|
19000
|
+
//while (atomic_load(n_barrier_passed) == passed_old) {
|
|
19001
|
+
//}
|
|
19002
|
+
const int n_spin_before_sleep = 100000;
|
|
19003
|
+
while (true) {
|
|
19004
|
+
for (int i = 0; i < n_spin_before_sleep; i++) {
|
|
19005
|
+
if (atomic_load(n_barrier_passed) != passed_old) {
|
|
19006
|
+
return;
|
|
19007
|
+
}
|
|
19008
|
+
#if defined(__SSE3__)
|
|
19009
|
+
_mm_pause();
|
|
19010
|
+
#endif
|
|
19011
|
+
}
|
|
19167
19012
|
sched_yield();
|
|
19168
19013
|
}
|
|
19169
|
-
|
|
19170
|
-
* task_phase = atomic_load(&state->shared->node_task);
|
|
19171
|
-
if (* task_phase != last_task_phase) break;
|
|
19172
|
-
#if defined(__SSE3__)
|
|
19173
|
-
// Tell the processor we're spinning. It's a processor hint for spinlocks.
|
|
19174
|
-
_mm_pause();
|
|
19175
|
-
#endif
|
|
19176
19014
|
}
|
|
19177
19015
|
}
|
|
19016
|
+
#endif
|
|
19178
19017
|
|
|
19179
19018
|
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
19180
19019
|
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
|
@@ -19182,136 +19021,54 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|
|
19182
19021
|
const struct ggml_cgraph * cgraph = state->shared->cgraph;
|
|
19183
19022
|
const struct ggml_cplan * cplan = state->shared->cplan;
|
|
19184
19023
|
|
|
19185
|
-
const int
|
|
19024
|
+
const int ith = state->ith;
|
|
19025
|
+
const int n_threads = state->shared->n_threads;
|
|
19186
19026
|
|
|
19187
|
-
set_numa_thread_affinity(
|
|
19027
|
+
set_numa_thread_affinity(ith);
|
|
19188
19028
|
|
|
19189
|
-
|
|
19190
|
-
|
|
19029
|
+
struct ggml_compute_params params = {
|
|
19030
|
+
/*.type =*/ GGML_TASK_TYPE_INIT,
|
|
19031
|
+
/*.ith =*/ ith,
|
|
19032
|
+
/*.nth =*/ state->shared->n_threads,
|
|
19033
|
+
/*.wsize =*/ cplan->work_size,
|
|
19034
|
+
/*.wdata =*/ cplan->work_data,
|
|
19035
|
+
};
|
|
19191
19036
|
|
|
19192
|
-
|
|
19037
|
+
for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) {
|
|
19193
19038
|
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
|
19194
|
-
state->shared->node_n += 1;
|
|
19195
19039
|
state->ec = GGML_STATUS_ABORTED;
|
|
19196
19040
|
return 0;
|
|
19197
19041
|
}
|
|
19198
19042
|
|
|
19199
|
-
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
|
19200
|
-
// all other threads are finished and spinning
|
|
19201
|
-
// do finalize and init here so we don't have synchronize again
|
|
19202
|
-
struct ggml_compute_params params = {
|
|
19203
|
-
/*.type =*/ GGML_TASK_TYPE_FINALIZE,
|
|
19204
|
-
/*.ith =*/ 0,
|
|
19205
|
-
/*.nth =*/ 0,
|
|
19206
|
-
/*.wsize =*/ cplan->work_size,
|
|
19207
|
-
/*.wdata =*/ cplan->work_data,
|
|
19208
|
-
};
|
|
19209
|
-
|
|
19210
|
-
if (node_n != -1) {
|
|
19211
|
-
/* FINALIZE */
|
|
19212
|
-
struct ggml_tensor * node = cgraph->nodes[node_n];
|
|
19213
|
-
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
|
19214
|
-
params.nth = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
|
19215
|
-
ggml_compute_forward(&params, node, state);
|
|
19216
|
-
}
|
|
19217
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
|
19218
|
-
}
|
|
19219
|
-
|
|
19220
|
-
// distribute new work or execute it direct if 1T
|
|
19221
|
-
while (++node_n < cgraph->n_nodes) {
|
|
19222
|
-
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
|
19223
|
-
struct ggml_tensor * node = cgraph->nodes[node_n];
|
|
19224
|
-
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
|
19225
|
-
|
|
19226
|
-
state->shared->perf_node_start_cycles = ggml_perf_cycles();
|
|
19227
|
-
state->shared->perf_node_start_time_us = ggml_perf_time_us();
|
|
19228
|
-
|
|
19229
|
-
params.nth = n_tasks;
|
|
19230
|
-
|
|
19231
|
-
if (n_tasks == 1) {
|
|
19232
|
-
/* INIT */
|
|
19233
|
-
if (GGML_OP_HAS_INIT[node->op]) {
|
|
19234
|
-
params.type = GGML_TASK_TYPE_INIT;
|
|
19235
|
-
ggml_compute_forward(&params, node, state);
|
|
19236
|
-
}
|
|
19237
|
-
|
|
19238
|
-
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
|
19239
|
-
// they do something more efficient than spinning (?)
|
|
19240
|
-
params.type = GGML_TASK_TYPE_COMPUTE;
|
|
19241
|
-
ggml_compute_forward(&params, node, state);
|
|
19242
|
-
|
|
19243
|
-
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
|
19244
|
-
params.type = GGML_TASK_TYPE_FINALIZE;
|
|
19245
|
-
ggml_compute_forward(&params, node, state);
|
|
19246
|
-
}
|
|
19247
|
-
|
|
19248
|
-
ggml_graph_compute_perf_stats_node(node, state->shared);
|
|
19249
|
-
} else {
|
|
19250
|
-
break;
|
|
19251
|
-
}
|
|
19252
|
-
|
|
19253
|
-
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
|
19254
|
-
break;
|
|
19255
|
-
}
|
|
19256
|
-
}
|
|
19257
|
-
|
|
19258
|
-
task_phase = GGML_TASK_TYPE_INIT;
|
|
19259
|
-
atomic_store(&state->shared->n_active, n_threads);
|
|
19260
|
-
atomic_store(&state->shared->node_n, node_n);
|
|
19261
|
-
atomic_store(&state->shared->node_task, task_phase);
|
|
19262
|
-
} else {
|
|
19263
|
-
ggml_graph_compute_thread_sync_node(&node_n, state, false);
|
|
19264
|
-
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
|
19265
|
-
}
|
|
19266
|
-
|
|
19267
|
-
// check if we should stop
|
|
19268
|
-
if (node_n >= cgraph->n_nodes) break;
|
|
19269
|
-
|
|
19270
|
-
/* INIT & COMPUTE */
|
|
19271
19043
|
struct ggml_tensor * node = cgraph->nodes[node_n];
|
|
19272
19044
|
const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads);
|
|
19273
19045
|
|
|
19274
|
-
|
|
19275
|
-
/*.type =*/ GGML_TASK_TYPE_INIT,
|
|
19276
|
-
/*.ith =*/ state->ith,
|
|
19277
|
-
/*.nth =*/ n_tasks,
|
|
19278
|
-
/*.wsize =*/ cplan->work_size,
|
|
19279
|
-
/*.wdata =*/ cplan->work_data,
|
|
19280
|
-
};
|
|
19046
|
+
params.nth = n_tasks;
|
|
19281
19047
|
|
|
19282
|
-
|
|
19283
|
-
|
|
19048
|
+
/* INIT */
|
|
19049
|
+
if (GGML_OP_HAS_INIT[node->op]) {
|
|
19050
|
+
if (ith < n_tasks) {
|
|
19051
|
+
params.type = GGML_TASK_TYPE_INIT;
|
|
19284
19052
|
ggml_compute_forward(&params, node, state);
|
|
19285
19053
|
}
|
|
19054
|
+
ggml_barrier(state);
|
|
19286
19055
|
}
|
|
19287
19056
|
|
|
19288
|
-
|
|
19289
|
-
|
|
19290
|
-
atomic_store(&state->shared->n_active, n_threads);
|
|
19291
|
-
atomic_store(&state->shared->node_task, task_phase);
|
|
19292
|
-
}
|
|
19293
|
-
else {
|
|
19294
|
-
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
|
19295
|
-
// depending on the workload and the operating system.
|
|
19296
|
-
// since it is not clear what is the best approach, it should potentially become user-configurable
|
|
19297
|
-
// ref: https://github.com/ggerganov/ggml/issues/291
|
|
19298
|
-
// UPD: adding the do_yield flag seems to resolve the issue universally
|
|
19299
|
-
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
|
|
19300
|
-
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
|
|
19301
|
-
}
|
|
19302
|
-
|
|
19303
|
-
if (state->ith < n_tasks) {
|
|
19057
|
+
/* COMPUTE */
|
|
19058
|
+
if (ith < n_tasks) {
|
|
19304
19059
|
params.type = GGML_TASK_TYPE_COMPUTE;
|
|
19305
19060
|
ggml_compute_forward(&params, node, state);
|
|
19306
19061
|
}
|
|
19307
19062
|
|
|
19308
|
-
|
|
19309
|
-
|
|
19310
|
-
|
|
19311
|
-
|
|
19312
|
-
|
|
19313
|
-
|
|
19314
|
-
|
|
19063
|
+
ggml_barrier(state);
|
|
19064
|
+
|
|
19065
|
+
/* FINALIZE */
|
|
19066
|
+
if (GGML_OP_HAS_FINALIZE[node->op]) {
|
|
19067
|
+
if (params.ith == 0) {
|
|
19068
|
+
params.type = GGML_TASK_TYPE_FINALIZE;
|
|
19069
|
+
ggml_compute_forward(&params, node, state);
|
|
19070
|
+
}
|
|
19071
|
+
ggml_barrier(state);
|
|
19315
19072
|
}
|
|
19316
19073
|
}
|
|
19317
19074
|
|
|
@@ -19368,17 +19125,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|
|
19368
19125
|
{
|
|
19369
19126
|
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
|
|
19370
19127
|
|
|
19371
|
-
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
|
19372
|
-
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
|
19373
|
-
if (node->src[0]->type != GGML_TYPE_F32) {
|
|
19374
|
-
// here we need memory for fully dequantized matrix from src0
|
|
19375
|
-
// take into account that src0 can be broadcasted into src1[2,3]
|
|
19376
|
-
cur = ggml_type_size(GGML_TYPE_F32)
|
|
19377
|
-
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
|
19378
|
-
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
|
19379
|
-
}
|
|
19380
|
-
} else
|
|
19381
|
-
#endif
|
|
19382
19128
|
if (node->src[1]->type != vec_dot_type) {
|
|
19383
19129
|
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
|
|
19384
19130
|
}
|
|
@@ -19509,7 +19255,6 @@ static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state *
|
|
|
19509
19255
|
// update the number of threads from the actual number of threads that we got from OpenMP
|
|
19510
19256
|
n_threads = omp_get_num_threads();
|
|
19511
19257
|
workers[0].shared->n_threads = n_threads;
|
|
19512
|
-
workers[0].shared->n_active = n_threads;
|
|
19513
19258
|
}
|
|
19514
19259
|
ggml_graph_compute_thread(&workers[omp_get_thread_num()]);
|
|
19515
19260
|
}
|
|
@@ -19572,9 +19317,8 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
|
|
19572
19317
|
/*.perf_node_start_cycles =*/ 0,
|
|
19573
19318
|
/*.perf_node_start_time_us =*/ 0,
|
|
19574
19319
|
/*.n_threads =*/ n_threads,
|
|
19575
|
-
/*.
|
|
19576
|
-
/*.
|
|
19577
|
-
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
|
|
19320
|
+
/*.n_barrier =*/ 0,
|
|
19321
|
+
/*.n_barrier_passed =*/ 0,
|
|
19578
19322
|
/*.abort_callback =*/ NULL,
|
|
19579
19323
|
/*.abort_callback_data =*/ NULL,
|
|
19580
19324
|
/*.current_chunk; =*/ 0,
|
|
@@ -22676,7 +22420,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|
|
22676
22420
|
}
|
|
22677
22421
|
|
|
22678
22422
|
int ggml_cpu_has_blas(void) {
|
|
22679
|
-
#if defined(
|
|
22423
|
+
#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
|
22680
22424
|
return 1;
|
|
22681
22425
|
#else
|
|
22682
22426
|
return 0;
|