llama_cpp 0.16.0 → 0.16.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/extconf.rb +2 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +110 -53
- data/vendor/tmp/llama.cpp/ggml-alloc.c +78 -22
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +20 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +178 -64
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -3
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +363 -0
- data/vendor/tmp/llama.cpp/ggml-blas.h +23 -0
- data/vendor/tmp/llama.cpp/ggml-common.h +6 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +21 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +15 -1491
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +76 -61
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +77 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +1 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +5 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +20 -0
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +95 -129
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +8 -7
- data/vendor/tmp/llama.cpp/ggml-metal.m +11 -9
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +13 -12
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +19 -23
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +1230 -1129
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +181 -148
- data/vendor/tmp/llama.cpp/ggml.c +102 -275
- data/vendor/tmp/llama.cpp/llama.cpp +103 -47
- data/vendor/tmp/llama.cpp/llama.h +4 -0
- metadata +15 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -297,12 +297,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
 #endif
 
 // floating point type used to accumulate sums
@@ -3212,35 +3206,42 @@ GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
+    size_t next_nb = ggml_type_size(tensor->type);
+    if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) {
+        return false;
+    }
+    next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        if (tensor->ne[i] != 1) {
+            if (i > n) {
+                if (tensor->nb[i] != next_nb) {
+                    return false;
+                }
+                next_nb *= tensor->ne[i];
+            } else {
+                // this dimension does not need to be contiguous
+                next_nb = tensor->ne[i]*tensor->nb[i];
+            }
+        }
+    }
+    return true;
+}
 
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+    return ggml_is_contiguous_0(tensor);
 }
 
 GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
-    return ggml_is_contiguous(tensor);
+    return ggml_is_contiguous_n(tensor, 0);
 }
 
 GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 1);
 }
 
 GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == ggml_type_size(tensor->type) &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+    return ggml_is_contiguous_n(tensor, 2);
 }
 
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
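The ggml_is_contiguous_n helper introduced above generalizes the old hand-written stride checks: every dimension above n must be packed tightly against the previous one, while dimensions up to n may keep arbitrary strides. A minimal usage sketch of the public predicates (illustrative only, not shipped with this gem; it assumes the vendored ggml.h and links against the bundled library):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // an 8x4 f32 tensor is allocated with tightly packed strides
        struct ggml_tensor * a  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
        // transposing only swaps the strides (nb), producing a non-contiguous view
        struct ggml_tensor * at = ggml_transpose(ctx, a);

        printf("a : %d\n", ggml_is_contiguous(a));   // expected: 1
        printf("at: %d\n", ggml_is_contiguous(at));  // expected: 0

        ggml_free(ctx);
        return 0;
    }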
@@ -3272,20 +3273,20 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->ne[0] == t1->ne[0] ) &&
-        (t0->ne[1] == t1->ne[1] ) &&
-        (t0->ne[2] == t1->ne[2] ) &&
-        (t0->ne[3] == t1->ne[3] );
+        (t0->ne[0] == t1->ne[0]) &&
+        (t0->ne[1] == t1->ne[1]) &&
+        (t0->ne[2] == t1->ne[2]) &&
+        (t0->ne[3] == t1->ne[3]);
 }
 
 bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
-        (t0->nb[0] == t1->nb[0] ) &&
-        (t0->nb[1] == t1->nb[1] ) &&
-        (t0->nb[2] == t1->nb[2] ) &&
-        (t0->nb[3] == t1->nb[3] );
+        (t0->nb[0] == t1->nb[0]) &&
+        (t0->nb[1] == t1->nb[1]) &&
+        (t0->nb[2] == t1->nb[2]) &&
+        (t0->nb[3] == t1->nb[3]);
 }
 
 // check if t1 can be represented as a repeatition of t0
@@ -4078,32 +4079,26 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 return ((int8_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 return ((int16_t *)(tensor->data))[i];
             }
         case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 return ((int32_t *)(tensor->data))[i];
             }
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]);
             }
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 return ((float *)(tensor->data))[i];
             }
         default:
@@ -4125,32 +4120,26 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
     switch (tensor->type) {
         case GGML_TYPE_I8:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
                 ((int8_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_I16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
                 ((int16_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_I32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
                 ((int32_t *)(tensor->data))[i] = value;
             } break;
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
                 ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
             } break;
         case GGML_TYPE_BF16:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t));
                 ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value);
             } break;
         case GGML_TYPE_F32:
             {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
                 ((float *)(tensor->data))[i] = value;
             } break;
         default:
@@ -7343,13 +7332,15 @@ struct ggml_tensor * ggml_add_rel_pos_inplace(
     return ggml_add_rel_pos_impl(ctx, a, pw, ph, true);
 }
 
-//
+// ggml_unary
 
 static struct ggml_tensor * ggml_unary_impl(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
         enum ggml_unary_op op,
         bool inplace) {
+    GGML_ASSERT(ggml_is_contiguous_1(a));
+
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
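The new GGML_ASSERT in ggml_unary_impl means that building a unary op such as ggml_relu on a non-contiguous view now fails loudly at graph-construction time. A hedged fragment showing the caller-side consequence (assumes a valid ggml_context * ctx as in the earlier sketch; not code from the gem):

    struct ggml_tensor * x  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    struct ggml_tensor * xt = ggml_transpose(ctx, x);               // non-contiguous view
    // ggml_relu(ctx, xt);                                          // would trip GGML_ASSERT(ggml_is_contiguous_1(a))
    struct ggml_tensor * y  = ggml_relu(ctx, ggml_cont(ctx, xt));   // ggml_cont materializes a contiguous copy first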
@@ -11014,6 +11005,8 @@ static void ggml_compute_forward_abs_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11023,9 +11016,6 @@ static void ggml_compute_forward_abs_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_abs_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11060,6 +11050,8 @@ static void ggml_compute_forward_sgn_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11069,9 +11061,6 @@ static void ggml_compute_forward_sgn_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sgn_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11106,6 +11095,8 @@ static void ggml_compute_forward_neg_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11115,9 +11106,6 @@ static void ggml_compute_forward_neg_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_neg_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11152,6 +11140,8 @@ static void ggml_compute_forward_step_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11161,9 +11151,6 @@ static void ggml_compute_forward_step_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_step_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11198,6 +11185,8 @@ static void ggml_compute_forward_tanh_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11207,9 +11196,6 @@ static void ggml_compute_forward_tanh_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_tanh_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11244,6 +11230,8 @@ static void ggml_compute_forward_elu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11253,9 +11241,6 @@ static void ggml_compute_forward_elu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_elu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11290,6 +11275,8 @@ static void ggml_compute_forward_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11299,9 +11286,6 @@ static void ggml_compute_forward_relu_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_relu_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11336,6 +11320,8 @@ static void ggml_compute_forward_sigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11345,9 +11331,6 @@ static void ggml_compute_forward_sigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_sigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11381,9 +11364,9 @@ static void ggml_compute_forward_gelu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11444,9 +11427,9 @@ static void ggml_compute_forward_gelu_quick_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11507,9 +11490,9 @@ static void ggml_compute_forward_silu_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11570,6 +11553,8 @@ static void ggml_compute_forward_leaky_relu_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11619,11 +11604,11 @@ static void ggml_compute_forward_silu_back_f32(
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * grad = dst->src[1];
 
-    GGML_ASSERT(ggml_is_contiguous_1(grad));
-    GGML_ASSERT(ggml_is_contiguous_1(src0));
-    GGML_ASSERT(ggml_is_contiguous_1(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, grad));
+    assert(ggml_is_contiguous_1(grad));
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
+    assert(ggml_are_same_shape(src0, grad));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -11685,6 +11670,8 @@ static void ggml_compute_forward_hardswish_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11694,9 +11681,6 @@ static void ggml_compute_forward_hardswish_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardswish_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -11728,6 +11712,8 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const struct ggml_tensor * src0 = dst->src[0];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -11737,9 +11723,6 @@ static void ggml_compute_forward_hardsigmoid_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert(dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         ggml_vec_hardsigmoid_f32(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -12190,39 +12173,6 @@ static void ggml_compute_forward_group_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
-    //       all the experts for each batch element and the processing would become incredibly slow
-    // TODO: find the optimal values for these
-    if (dst->op != GGML_OP_MUL_MAT_ID &&
-        ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        //src0->type == GGML_TYPE_F32 &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
 static void ggml_compute_forward_mul_mat_one_chunk(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,
@@ -12360,73 +12310,6 @@ static void ggml_compute_forward_mul_mat(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane = ne01*ne00;
-        const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
-
-        if (params->type == GGML_TASK_TYPE_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                        float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                        ggml_to_float_t const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
-            return;
-        }
-
-        if (params->type == GGML_TASK_TYPE_FINALIZE) {
-            return;
-        }
-
-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
-        for (int64_t i13 = 0; i13 < ne13; i13++) {
-            for (int64_t i12 = 0; i12 < ne12; i12++) {
-                const int64_t i03 = i13/r3;
-                const int64_t i02 = i12/r2;
-
-                const void  * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                      float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                }
-
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f, y, ne10,
-                              x, ne00,
-                        0.0f, d, ne01);
-            }
-        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
 #if GGML_USE_LLAMAFILE
     const bool src1_cont = ggml_is_contiguous(src1);
 
@@ -12807,19 +12690,7 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    bool use_blas = ggml_is_matrix(src0) &&
-        ggml_is_matrix(src1) &&
-        ggml_is_contiguous(src0) &&
-        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
     if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
-        if (use_blas) {
-            return;
-        }
-#endif
         if (ith != 0) {
             return;
         }
@@ -12831,50 +12702,6 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (use_blas) {
-        if (params->ith != 0) { // All threads other than the first do no work.
-            return;
-        }
-        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-        // src0: (k,n)
-        // src1: (k,m)
-        // dst: (m,n)
-        //
-        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-        // Also expressed as (major,minor)
-        // a: (m,k): so src1 transposed
-        // b: (k,n): so src0
-        // c: (m,n)
-        //
-        // However, if ggml_is_transposed(src1) is true, then
-        // src1->data already contains a transposed version, so sgemm mustn't
-        // transpose it further.
-
-        int n = src0->ne[0];
-        int k = src0->ne[1];
-        int m = src1->ne[0];
-
-        int transposeA, lda;
-
-        if (!ggml_is_transposed(src1)) {
-            transposeA = CblasTrans;
-            lda = m;
-        } else {
-            transposeA = CblasNoTrans;
-            lda = k;
-        }
-
-        float * a = (float *) ((char *) src1->data);
-        float * b = (float *) ((char *) src0->data);
-        float * c = (float *) ((char *) dst->data);
-
-        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-        return;
-    }
-#endif
-
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
@@ -13004,8 +12831,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
     if (params->type == GGML_TASK_TYPE_INIT) {
         if (ith != 0) {
             return;
@@ -13402,6 +13227,8 @@ static void ggml_compute_forward_get_rows_q(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         dequantize_row_q(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13445,6 +13272,8 @@ static void ggml_compute_forward_get_rows_f16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_fp16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
@@ -13488,7 +13317,9 @@ static void ggml_compute_forward_get_rows_bf16(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
-        ggml_bf16_to_fp32_row(
+        assert(i01 >= 0 && i01 < ne01);
+
+        ggml_bf16_to_fp32_row(
                 (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
@@ -13531,6 +13362,8 @@ static void ggml_compute_forward_get_rows_f32(
         const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
         const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
 
+        assert(i01 >= 0 && i01 < ne01);
+
         ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
@@ -16686,7 +16519,10 @@ static void ggml_compute_forward_map_unary_f32(
 
     const struct ggml_tensor * src0 = dst->src[0];
 
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
         return;
@@ -16695,9 +16531,6 @@ static void ggml_compute_forward_map_unary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -16735,6 +16568,9 @@ static void ggml_compute_forward_map_binary_f32(
     const struct ggml_tensor * src1 = dst->src[1];
 
     assert(params->ith == 0);
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
     assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
 
     if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
@@ -16744,10 +16580,6 @@ static void ggml_compute_forward_map_binary_f32(
     const int n = ggml_nrows(src0);
     const int nc = src0->ne[0];
 
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
     for (int i = 0; i < n; i++) {
         fun(nc,
                 (float *) ((char *) dst->data + i*( dst->nb[1])),
@@ -18905,6 +18737,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
+        case GGML_OP_CONT:
         case GGML_OP_ADD:
         case GGML_OP_ADD1:
         case GGML_OP_ACC:
@@ -18989,7 +18822,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
             } break;
         case GGML_OP_SCALE:
         case GGML_OP_SET:
-        case GGML_OP_CONT:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
@@ -19149,8 +18981,11 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
             sched_yield();
         }
 
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
+        *node_n = atomic_load(&state->shared->node_n);
+        if (*node_n != last_node_n) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
@@ -19160,15 +18995,18 @@ static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_comput
 
 static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
     // wait for other threads to finish
-    const int last_task_phase = * task_phase;
+    const int last_task_phase = *task_phase;
 
     while (true) {
         if (do_yield) {
             sched_yield();
        }
 
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
+        *task_phase = atomic_load(&state->shared->node_task);
+        if (*task_phase != last_task_phase) {
+            break;
+        }
+
 #if defined(__SSE3__)
         // Tell the processor we're spinning. It's a processor hint for spinlocks.
         _mm_pause();
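The two thread-sync hunks above only reshape a one-line spin check into an explicit block; the polling pattern itself is unchanged: re-read the shared counter with atomic_load until it moves past the value captured on entry, hinting the CPU while spinning. A standalone C11 sketch of that pattern (illustrative, with made-up names; not code from the gem):

    #include <stdatomic.h>
    #include <stdbool.h>
    #if defined(__SSE3__)
    #include <immintrin.h>
    #endif

    static atomic_int shared_phase;

    // Spin until the shared phase differs from the caller's last observed value.
    static int wait_for_phase_change(int last_phase) {
        while (true) {
            const int phase = atomic_load(&shared_phase);
            if (phase != last_phase) {
                return phase;
            }
    #if defined(__SSE3__)
            _mm_pause(); // processor hint for spin-wait loops
    #endif
        }
    }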
@@ -19368,17 +19206,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                 {
                     const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory for fully dequantized matrix from src0
-                            // take into account that src0 can be broadcasted into src1[2,3]
-                            cur = ggml_type_size(GGML_TYPE_F32)
-                                * node->src[0]->ne[0]*node->src[0]->ne[1]
-                                * node->src[1]->ne[2]*node->src[1]->ne[3];
-                        }
-                    } else
-#endif
                     if (node->src[1]->type != vec_dot_type) {
                         cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
                     }
@@ -22676,7 +22503,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;