llama_cpp 0.14.2 → 0.14.3

@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
  #else
  #include <cblas.h>
  #endif
- #elif defined(GGML_USE_CUBLAS)
- #include "ggml-cuda.h"
  #elif defined(GGML_USE_CLBLAST)
  #include "ggml-opencl.h"
  #elif defined(GGML_USE_VULKAN)
@@ -931,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
  #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
  #endif

+ #elif defined(__AVX512F__)
+
+ #define GGML_SIMD
+
+ // F32 AVX512
+
+ #define GGML_F32_STEP 64
+ #define GGML_F32_EPR 16
+
+ #define GGML_F32x16 __m512
+ #define GGML_F32x16_ZERO _mm512_setzero_ps()
+ #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+ #define GGML_F32x16_LOAD _mm512_loadu_ps
+ #define GGML_F32x16_STORE _mm512_storeu_ps
+ // _mm512_fmadd_ps is defined in AVX512F so no guard is required
+ #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32x16_ADD _mm512_add_ps
+ #define GGML_F32x16_MUL _mm512_mul_ps
+ #define GGML_F32x16_REDUCE(res, x) \
+ do { \
+     int offset = GGML_F32_ARR >> 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     offset >>= 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     offset >>= 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     res = _mm512_reduce_add_ps(x[0]); \
+ } while (0)
+
+ // TODO: is this optimal ?
+
+ #define GGML_F32_VEC GGML_F32x16
+ #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
+ #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
+ #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
+ #define GGML_F32_VEC_STORE GGML_F32x16_STORE
+ #define GGML_F32_VEC_FMA GGML_F32x16_FMA
+ #define GGML_F32_VEC_ADD GGML_F32x16_ADD
+ #define GGML_F32_VEC_MUL GGML_F32x16_MUL
+ #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+ // F16 AVX512
+
+ // F16 AVX
+
+ #define GGML_F16_STEP 64
+ #define GGML_F16_EPR 16
+
+ // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+ #define GGML_F32Cx16 __m512
+ #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
+ #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
+
+ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+ // so F16C guard isn't required
+ #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+ #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+ #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+ #define GGML_F32Cx16_ADD _mm512_add_ps
+ #define GGML_F32Cx16_MUL _mm512_mul_ps
+ #define GGML_F32Cx16_REDUCE(res, x) \
+ do { \
+     int offset = GGML_F32_ARR >> 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     offset >>= 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     offset >>= 1; \
+     for (int i = 0; i < offset; ++i) { \
+         x[i] = _mm512_add_ps(x[i], x[offset+i]); \
+     } \
+     res = _mm512_reduce_add_ps(x[0]); \
+ } while (0)
+
+ #define GGML_F16_VEC GGML_F32Cx16
+ #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
+ #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
+ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
+ #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
+ #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
+ #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
+
  #elif defined(__AVX__)

  #define GGML_SIMD
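For readers unfamiliar with ggml's SIMD abstraction, the new GGML_F32x16 macros follow the same pattern as the existing AVX/NEON paths: keep GGML_F32_ARR (GGML_F32_STEP / GGML_F32_EPR = 4) accumulators live, FMA into them across the loop body, and fold them with the REDUCE macro at the end. The sketch below illustrates that pattern directly with AVX512F intrinsics; it is not ggml's actual kernel, and the function name dot_f32_avx512 plus the assumption that n is a multiple of 64 are mine.

// Minimal sketch of the accumulation pattern the GGML_F32x16 macros encode.
// Assumes AVX512F (compile with -mavx512f) and n divisible by 64 (GGML_F32_STEP);
// this is an illustration, not the actual ggml dot-product kernel.
#include <immintrin.h>

static float dot_f32_avx512(const float *x, const float *y, int n) {
    __m512 acc[4] = { _mm512_setzero_ps(), _mm512_setzero_ps(),
                      _mm512_setzero_ps(), _mm512_setzero_ps() };
    for (int i = 0; i < n; i += 64) {                      // GGML_F32_STEP = 64 floats per iteration
        for (int j = 0; j < 4; ++j) {                      // GGML_F32_ARR = 64 / 16 = 4 registers
            __m512 ax = _mm512_loadu_ps(x + i + 16*j);     // GGML_F32_VEC_LOAD
            __m512 ay = _mm512_loadu_ps(y + i + 16*j);
            acc[j] = _mm512_fmadd_ps(ax, ay, acc[j]);      // GGML_F32_VEC_FMA(acc[j], ax, ay)
        }
    }
    // GGML_F32x16_REDUCE: pairwise-add the 4 accumulators (the macro's three halving
    // loops; the last one is a no-op when GGML_F32_ARR is 4), then horizontal add.
    acc[0] = _mm512_add_ps(acc[0], acc[2]);
    acc[1] = _mm512_add_ps(acc[1], acc[3]);
    acc[0] = _mm512_add_ps(acc[0], acc[1]);
    return _mm512_reduce_add_ps(acc[0]);
}

The GGML_F32Cx16 variants work the same way, except that loads and stores pass through _mm512_cvtph_ps / _mm512_cvtps_ph so fp16 data can be accumulated in fp32 registers.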
@@ -2545,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
  }

- #if defined(GGML_USE_CUBLAS)
- ggml_init_cublas();
- #elif defined(GGML_USE_CLBLAST)
+ #if defined(GGML_USE_CLBLAST)
  ggml_cl_init();
  #elif defined(GGML_USE_VULKAN)
  ggml_vk_init_cpu_assist();
@@ -11010,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
  // TODO: #if defined(GGML_USE_CLBLAST)

  #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
  // nb01 >= nb00 - src0 is not transposed
  // compute by src0 rows

- // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
  // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

  if (params->type == GGML_TASK_TYPE_INIT) {
@@ -15956,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  return;
  }

- #ifdef GGML_USE_CUBLAS
- bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
- }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #elif defined(GGML_USE_VULKAN)
+ #if defined(GGML_USE_VULKAN)
  const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
  #ifdef GGML_VULKAN_CHECK_RESULTS
  if (skip_cpu) {
@@ -15975,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
  }
  GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
  GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_CUBLAS
+ #endif // GGML_USE_VULKAN

  #ifdef GGML_USE_SYCL
  bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
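The last two hunks drop the CUBLAS branch from ggml_compute_forward's per-op dispatch; the remaining Vulkan and SYCL branches keep the same shape: offer the op to the backend and fall through to the CPU kernels only when it declines (skip_cpu == false). A rough sketch of that shape, with backend_try_compute standing in for ggml_vk_compute_forward_cpu_assist / ggml_sycl_compute_forward; the names and types here are illustrative, not ggml's API.

// Illustrative shape of the per-op backend dispatch in ggml_compute_forward.
#include <stdbool.h>

struct params;   /* stand-in for ggml_compute_params */
struct tensor;   /* stand-in for ggml_tensor */

/* hypothetical: returns true if the accelerated backend handled the op */
bool backend_try_compute(struct params *p, struct tensor *t);
/* hypothetical: reference CPU implementation of the op */
void cpu_compute(struct params *p, struct tensor *t);

void compute_forward(struct params *p, struct tensor *t) {
    if (backend_try_compute(p, t)) {
        return;              // skip_cpu == true: nothing left to do on the CPU
    }
    cpu_compute(p, t);       // otherwise fall through to the CPU kernels
}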