llama_cpp 0.14.2 → 0.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
@@ -931,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif
 
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                    \
+do {                                                  \
+    int offset = GGML_F32_ARR >> 1;                   \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    offset >>= 1;                                     \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    offset >>= 1;                                     \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    res = _mm512_reduce_add_ps(x[0]);                 \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                   \
+do {                                                  \
+    int offset = GGML_F32_ARR >> 1;                   \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    offset >>= 1;                                     \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    offset >>= 1;                                     \
+    for (int i = 0; i < offset; ++i) {                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);      \
+    }                                                 \
+    res = _mm512_reduce_add_ps(x[0]);                 \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)
 
 #define GGML_SIMD
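
Note (editorial, not part of the diff): the macros added above plug into ggml's generic SIMD pattern, in which GGML_F32_ARR = GGML_F32_STEP / GGML_F32_EPR = 64 / 16 = 4 partial-sum registers are accumulated and then folded by GGML_F32x16_REDUCE. The sketch below is an illustrative, self-contained approximation of that pattern written directly against the AVX512F intrinsics; it is not code from the package, and it assumes n is a multiple of 64 for brevity.

    #include <immintrin.h>
    #include <stddef.h>

    // Illustrative only: mirrors how GGML_F32_STEP / GGML_F32_EPR / GGML_F32x16_REDUCE are used.
    static float dot_f32_avx512(const float * x, const float * y, size_t n) {
        __m512 sum[4] = {                                   // GGML_F32_ARR accumulators
            _mm512_setzero_ps(), _mm512_setzero_ps(),       // GGML_F32x16_ZERO
            _mm512_setzero_ps(), _mm512_setzero_ps()
        };
        for (size_t i = 0; i < n; i += 64) {                // GGML_F32_STEP floats per outer step
            for (int j = 0; j < 4; ++j) {
                __m512 ax = _mm512_loadu_ps(x + i + j*16);  // GGML_F32x16_LOAD (EPR = 16 floats)
                __m512 ay = _mm512_loadu_ps(y + i + j*16);
                sum[j] = _mm512_fmadd_ps(ax, ay, sum[j]);   // GGML_F32x16_FMA: b*c + a
            }
        }
        // pairwise fold, as in GGML_F32x16_REDUCE, then one horizontal add
        sum[0] = _mm512_add_ps(sum[0], sum[2]);
        sum[1] = _mm512_add_ps(sum[1], sum[3]);
        sum[0] = _mm512_add_ps(sum[0], sum[1]);
        return _mm512_reduce_add_ps(sum[0]);
    }

The F16 variants follow the same pattern but load and store half-precision data through _mm512_cvtph_ps / _mm512_cvtps_ph, which AVX512F already provides (no F16C or AVX512_FP16 requirement), so the arithmetic is still done in FP32.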
@@ -2545,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }
 
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
         ggml_vk_init_cpu_assist();
@@ -11010,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -11210,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
 
-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 
     if (params->type == GGML_TASK_TYPE_INIT) {
@@ -15956,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {
@@ -15975,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN
 
 #ifdef GGML_USE_SYCL
     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
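
Note (editorial, not part of the diff): the last two hunks touch the same preamble of ggml_compute_forward. After the change, the only cpu-assist offload left ahead of the CPU dispatch is the Vulkan path; consolidated and with the GGML_VULKAN_CHECK_RESULTS block omitted, the remaining control flow reads roughly as follows (condensed sketch, not the verbatim function).

    #if defined(GGML_USE_VULKAN)
        const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
        if (skip_cpu) {
            return;  // the op was handled by the Vulkan cpu-assist path
        }
        // CPU fallback: inputs must live on the CPU backend
        GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
        GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
    #endif // GGML_USE_VULKAN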