llama_cpp 0.16.0 → 0.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
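The most consequential change in this file is the contiguity refactor in the second hunk below: the hard-coded stride comparisons in ggml_is_contiguous, ggml_is_contiguous_1 and ggml_is_contiguous_2 are replaced by one helper, ggml_is_contiguous_n(tensor, n), which lets the first n dimensions carry arbitrary strides (e.g. padded rows) while requiring every higher dimension to be densely packed on top of them. The standalone sketch below mirrors that logic for a plain float tensor to show the effect; the tensor4d struct and is_contiguous_n function are illustrative stand-ins written for this note, not part of the ggml API, and the padded-matrix values are made up.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

// Simplified stand-in for a 4-D float tensor: ne[i] = extent, nb[i] = stride in bytes.
struct tensor4d {
    long   ne[4];
    size_t nb[4];
};

// Mirrors the new ggml_is_contiguous_n() for float data (block size 1, element size 4):
// dimensions <= n may have arbitrary strides, dimensions > n must pack densely on top.
static bool is_contiguous_n(const struct tensor4d * t, int n) {
    size_t next_nb = sizeof(float);
    if (t->ne[0] != 1 && t->nb[0] != next_nb) {
        return false;
    }
    next_nb *= t->ne[0];
    for (int i = 1; i < 4; i++) {
        if (t->ne[i] != 1) {
            if (i > n) {
                if (t->nb[i] != next_nb) {
                    return false;
                }
                next_nb *= t->ne[i];
            } else {
                // this dimension does not need to be contiguous
                next_nb = t->ne[i]*t->nb[i];
            }
        }
    }
    return true;
}

int main(void) {
    // an 8 x 4 float matrix whose rows are padded to 10 floats (row stride 40 bytes, not 32)
    struct tensor4d padded = {
        .ne = { 8, 4, 1, 1 },
        .nb = { 4, 40, 160, 160 },
    };
    printf("n=0: %d\n", is_contiguous_n(&padded, 0)); // 0: row padding breaks full contiguity
    printf("n=1: %d\n", is_contiguous_n(&padded, 1)); // 1: padded rows are allowed at n = 1
    return 0;
}

Running this prints "n=0: 0" and "n=1: 1", which is exactly the distinction the new assert(ggml_is_contiguous_1(...)) checks in the unary-op kernels rely on.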
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif
 
 // floating point type used to accumulate sums
@@ -3212,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-
-
+static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = ggml_type_size(tensor->type);
+    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
 
-
-
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
 }
 
 GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
-    return
+    return ggml_is_contiguous_n(tensor, 0);
 }
 
 GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
-
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 1);
 }
 
 GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
-
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 2);
 }
 
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
@@ -3272,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->ne[0] == t1->ne[0]
-        (t0->ne[1] == t1->ne[1]
-        (t0->ne[2] == t1->ne[2]
-        (t0->ne[3] == t1->ne[3]
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
 }
 
 bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->nb[0] == t1->nb[0]
-        (t0->nb[1] == t1->nb[1]
-        (t0->nb[2] == t1->nb[2]
-        (t0->nb[3] == t1->nb[3]
+        (t0->nb[0] == t1->nb[0]) &&
+        (t0->nb[1] == t1->nb[1]) &&
+        (t0->nb[2] == t1->nb[2]) &&
+        (t0->nb[3] == t1->nb[3]);
 }
 
 // check if t1 can be represented as a repeatition of t0
@@ -4078,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 return ((int8_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 return ((int16_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 return ((int32_t *)(tensor->data))[i];
             }
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 return ((float *)(tensor->data))[i];
             }
         default:
@@ -4125,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 ((int8_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 ((int16_t *)(tensor->data))[i] = value;
             } break;
        case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 ((int32_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
             } break;
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 ((float *)(tensor->data))[i] = value;
             } break;
         default:
@@ -7343,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
     return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
 }
 
-//
+// ggml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_unary_op op,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
@@ -11014,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11023,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_abs_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11060,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11069,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sgn_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11106,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11115,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_neg_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11152,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11161,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_step_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11198,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11207,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_tanh_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11244,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11253,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_elu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11290,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11299,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_relu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11336,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11345,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11381,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-
-
-
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11444,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-
-
-
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11507,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-
-
-
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11570,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11619,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-
-
-
-
-
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, grad));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11685,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11694,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardswish_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11728,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11737,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardsigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12190,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    // all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
@@ -12360,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane = ne01*ne00;
-        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f, y, ne10,
-                        x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
 
@@ -12807,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12831,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst: (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     // for i1:
@@ -13004,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13402,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13445,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13488,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -13531,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
        const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
        const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -16686,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-
+    assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -16695,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16735,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
     const struct ggml_tensor * src1 = dst->src[1];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16744,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -18905,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18989,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -19149,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        *
-        if (*
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19160,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = *
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
        }
 
-        *
-        if (*
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19368,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
             {
                 const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                    if (node->src[0]->type != GGML_TYPE_F32) {
-                        // here we need memory for fully dequantized matrix from src0
-                        // take into account that src0 can be broadcasted into src1[2,3]
-                        cur = ggml_type_size(GGML_TYPE_F32)
-                            * node->src[0]->ne[0]*node->src[0]->ne[1]
-                            * node->src[1]->ne[2]*node->src[1]->ne[3];
-                    }
-                } else
-#endif
                 if (node->src[1]->type != vec_dot_type) {
                     cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                 }
@@ -22676,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;