llama_cpp 0.14.2 → 0.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +60 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +20 -3
- data/vendor/tmp/llama.cpp/ggml-alloc.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +5 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +154 -124
- data/vendor/tmp/llama.cpp/ggml-backend.h +4 -4
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +8741 -8691
- data/vendor/tmp/llama.cpp/ggml-cuda.h +6 -15
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +1 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +34 -11
- data/vendor/tmp/llama.cpp/ggml-metal.metal +260 -28
- data/vendor/tmp/llama.cpp/ggml-quants.c +25 -13
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +237 -78
- data/vendor/tmp/llama.cpp/ggml-sycl.h +6 -1
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +7 -0
- data/vendor/tmp/llama.cpp/ggml.c +98 -16
- data/vendor/tmp/llama.cpp/llama.cpp +382 -42
- data/vendor/tmp/llama.cpp/llama.h +19 -4
- metadata +3 -3
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
@@ -931,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif

+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)               \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)

 #define GGML_SIMD
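The centerpiece of the AVX512 block added above is the GGML_F32x16_REDUCE / GGML_F32Cx16_REDUCE pair: the accumulator registers are pairwise-added until a single __m512 remains, which is then summed horizontally with _mm512_reduce_add_ps. Below is a minimal standalone C sketch of that tree reduction, not taken from the gem: ACC_COUNT stands in for GGML_F32_ARR (GGML_F32_STEP / GGML_F32_EPR, i.e. 4 accumulators here), and reduce_accumulators is an illustrative name. It assumes an AVX512F-capable CPU and a compiler flag such as -mavx512f.

// Illustrative sketch of the tree reduction performed by the new
// GGML_F32x16_REDUCE macro: halve the number of accumulators each pass,
// then horizontally sum the last remaining register.
#include <immintrin.h>
#include <stdio.h>

#define ACC_COUNT 4   // stand-in for GGML_F32_ARR = GGML_F32_STEP / GGML_F32_EPR

static float reduce_accumulators(__m512 x[ACC_COUNT]) {
    for (int offset = ACC_COUNT >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            x[i] = _mm512_add_ps(x[i], x[offset + i]);  // fold the upper half into the lower half
        }
    }
    return _mm512_reduce_add_ps(x[0]);                  // horizontal sum of the 16 lanes
}

int main(void) {
    __m512 acc[ACC_COUNT];
    for (int i = 0; i < ACC_COUNT; ++i) {
        acc[i] = _mm512_set1_ps(1.0f);                  // 16 lanes of 1.0 in each accumulator
    }
    printf("%f\n", reduce_accumulators(acc));           // 4 * 16 * 1.0 = 64.000000
    return 0;
}

The macro unrolls the same halving into three fixed steps because GGML_F32_ARR is a compile-time constant; with four accumulators the generic loop above performs exactly the same additions.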
@@ -2545,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }

-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
         ggml_vk_init_cpu_assist();
@@ -11010,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

     if (params->type == GGML_TASK_TYPE_INIT) {
@@ -15956,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }

-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {
@@ -15975,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN

 #ifdef GGML_USE_SYCL
     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
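The new GGML_F32Cx16_LOAD / GGML_F32Cx16_STORE macros in the ggml.c hunks above handle the F16 path by widening half-precision values to floats with _mm512_cvtph_ps and narrowing back with _mm512_cvtps_ph, both plain AVX512F, so no F16C guard is needed. A minimal round-trip sketch, not from the gem (the buffer names and the doubling step are illustrative), assuming an AVX512F target:

// Widen 16 fp16 values to f32, operate on them, and narrow back to fp16,
// mirroring what GGML_F32Cx16_LOAD / GGML_F32Cx16_STORE do per vector.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint16_t src[16], dst[16];
    for (int i = 0; i < 16; ++i) {
        src[i] = 0x3C00;                            // 1.0 in IEEE half precision
    }

    // Load 16 halves and convert to a __m512 of floats (AVX512F only).
    __m512 v = _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *) src));

    v = _mm512_mul_ps(v, _mm512_set1_ps(2.0f));     // arbitrary f32 work

    // Convert back to halves with rounding mode 0 (round to nearest), as the macro does.
    _mm256_storeu_si256((__m256i *) dst, _mm512_cvtps_ph(v, 0));

    printf("0x%04x\n", dst[0]);                     // 0x4000, i.e. 2.0 in half precision
    return 0;
}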