llama_cpp 0.14.2 → 0.14.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
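This hunk removes ggml-cuda.h from ggml.c's compile-time include chain, so CUDA declarations are no longer pulled in transitively; code that talks to the CUDA backend directly now includes the header itself. A rough, hypothetical illustration of that pattern (a sketch, not part of the diff):

// Hypothetical consumer translation unit, shown only for the guard pattern.
#include "ggml.h"

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h" // included explicitly; ggml.c no longer provides it
#endif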
@@ -931,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif
 
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR 16
+
+#define GGML_F32x16 __m512
+#define GGML_F32x16_ZERO _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD _mm512_loadu_ps
+#define GGML_F32x16_STORE _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD _mm512_add_ps
+#define GGML_F32x16_MUL _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x) \
+do { \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC GGML_F32x16
+#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR 16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16 __m512
+#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD _mm512_add_ps
+#define GGML_F32Cx16_MUL _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x) \
+do { \
+    int offset = GGML_F32_ARR >> 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    offset >>= 1; \
+    for (int i = 0; i < offset; ++i) { \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+    } \
+    res = _mm512_reduce_add_ps(x[0]); \
+} while (0)
+
+#define GGML_F16_VEC GGML_F32Cx16
+#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)
 
 #define GGML_SIMD
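The block added above is the new AVX512 path for ggml's generic SIMD macro layer: one __m512 register holds 16 floats (GGML_F32_EPR), one step covers 64 floats (GGML_F32_STEP), so GGML_F32_ARR = STEP/EPR = 4 accumulators are kept live, and the REDUCE macro folds them pairwise before a final horizontal _mm512_reduce_add_ps. To make the intended usage concrete, here is a hedged, standalone sketch of a dot product written the way ggml's vector routines (e.g. ggml_vec_dot_f32) consume this layer; it uses the raw intrinsics directly, and the names dot_f32_avx512_sketch and SKETCH_F32_* are invented for illustration, not taken from the diff.

#include <immintrin.h>

// Standalone sketch (compile with -mavx512f). Mirrors how ggml's vector
// routines use the GGML_F32_VEC_* layer defined in the hunk above:
//   STEP = 64 floats per outer iteration, EPR = 16 floats per register,
//   ARR  = STEP/EPR = 4 independent accumulators.
#define SKETCH_F32_STEP 64
#define SKETCH_F32_EPR  16
#define SKETCH_F32_ARR  (SKETCH_F32_STEP/SKETCH_F32_EPR)

static float dot_f32_avx512_sketch(const int n, const float * x, const float * y) {
    const int np = (n & ~(SKETCH_F32_STEP - 1)); // bulk part, a multiple of 64

    __m512 sum[SKETCH_F32_ARR];
    for (int j = 0; j < SKETCH_F32_ARR; ++j) {
        sum[j] = _mm512_setzero_ps();            // GGML_F32_VEC_ZERO
    }

    for (int i = 0; i < np; i += SKETCH_F32_STEP) {
        for (int j = 0; j < SKETCH_F32_ARR; ++j) {
            const __m512 ax = _mm512_loadu_ps(x + i + j*SKETCH_F32_EPR); // GGML_F32_VEC_LOAD
            const __m512 ay = _mm512_loadu_ps(y + i + j*SKETCH_F32_EPR);
            sum[j] = _mm512_fmadd_ps(ax, ay, sum[j]);                    // GGML_F32_VEC_FMA
        }
    }

    // GGML_F32x16_REDUCE: fold the 4 accumulators pairwise, then do a single
    // horizontal add of the surviving register.
    sum[0] = _mm512_add_ps(sum[0], sum[2]);
    sum[1] = _mm512_add_ps(sum[1], sum[3]);
    sum[0] = _mm512_add_ps(sum[0], sum[1]);
    float res = _mm512_reduce_add_ps(sum[0]);

    // scalar tail for the elements that do not fill a whole step
    for (int i = np; i < n; ++i) {
        res += x[i]*y[i];
    }
    return res;
}

The GGML_F32Cx16 half of the block plays the same role for F16 data: values are stored as half precision but loaded into __m512 via _mm512_cvtph_ps and written back with _mm512_cvtps_ph, so the arithmetic itself stays in FP32.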
@@ -2545,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
         ggml_vk_init_cpu_assist();
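With this hunk, ggml_init no longer initializes CUDA implicitly (the ggml_init_cublas call is gone); as the ggml-backend.c and ggml-cuda.h entries in the file list suggest, the CUDA backend is now created explicitly through the ggml-backend interface. A minimal hedged sketch of what that explicit setup can look like for a caller, assuming the ggml_backend_cuda_init / ggml_backend_cpu_init entry points declared in those headers (init_backend_sketch is an invented name):

#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

// Sketch: select a backend explicitly instead of relying on ggml_init()
// to spin CUDA up behind the scenes (the behaviour removed in this hunk).
static ggml_backend_t init_backend_sketch(void) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUBLAS
    backend = ggml_backend_cuda_init(0); // device 0; NULL on failure
#endif
    if (backend == NULL) {
        backend = ggml_backend_cpu_init(); // CPU fallback
    }
    return backend;
}

The caller owns the handle and would release it with ggml_backend_free once computation is finished.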
@@ -11010,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_TYPE_INIT) {
@@ -15956,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }
 
-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {
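The lines removed here were the per-operation escape hatch that let the CUDA backend claim a tensor from inside the CPU forward pass (the skip_cpu pattern); only the Vulkan and SYCL "cpu assist" variants keep that shape. For CUDA, execution now goes through the backend graph interface instead. A hedged sketch of that style of dispatch, assuming the ggml_backend_graph_compute entry point from ggml-backend.h (compute_graph_sketch is an invented helper name):

#include "ggml.h"
#include "ggml-backend.h"

// Sketch: rather than ggml_compute_forward() asking CUDA whether to skip
// each CPU op (the branch removed above), the whole graph is handed to a
// backend object and executed there.
static void compute_graph_sketch(ggml_backend_t backend, struct ggml_cgraph * graph) {
    ggml_backend_graph_compute(backend, graph);
}

Splitting a graph across several backends (for example CPU plus CUDA) would go through the ggml_backend_sched scheduler rather than this single-backend call.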
@@ -15975,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN
 
 #ifdef GGML_USE_SYCL
     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);