llama_cpp 0.16.2 → 0.17.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
data/vendor/tmp/llama.cpp/ggml-cuda.h
@@ -1,44 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-// device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
-
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
-#ifdef __cplusplus
-}
-#endif
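
For context: the hunk above removes the public C interface of the vendored CUDA backend. As a minimal usage sketch (not part of the gem; it assumes only the declarations shown above plus ggml_backend_free() from the likewise-removed ggml-backend.h), a host program would enumerate GPUs and bring up the backend roughly like this:

// Hypothetical host-side usage of the removed ggml-cuda.h API.
#include <stdio.h>
#include "ggml-cuda.h"

int main(void) {
    // Enumerate CUDA devices via the removed backend API.
    int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        char desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s (%zu of %zu bytes free)\n", i, desc, free_mem, total_mem);
    }

    // Create a backend on device 0; returns NULL on failure.
    ggml_backend_t backend = ggml_backend_cuda_init(0);
    if (backend != NULL) {
        // ... build and evaluate a ggml graph on the GPU here ...
        ggml_backend_free(backend); // from ggml-backend.h (assumption)
    }
    return 0;
}
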
data/vendor/tmp/llama.cpp/ggml-impl.h
@@ -1,651 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───┐
- *     0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───────────────────┐
- *     0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- *       ┌sign
- *       │
- *       │  ┌exponent
- *       │  │
- *       │  │    ┌mantissa
- *       │  │    │
- *       │┌─┴─┐┌─┴──────┐
- *     0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
-    ggml_bf16_t h;
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.f = s;
-    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-        h.bits = (u.i >> 16) | 64; /* force to quiet */
-        return h;
-    }
-    if (!(u.i & 0x7f800000)) { /* subnormal */
-        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
-        return h;
-    }
-    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-    return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t  int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t  int8x16x2_t
-#define ggml_int8x16x4_t  int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2  vld1q_u8_x2
-#define ggml_vld1q_u8_x4  vld1q_u8_x4
-#define ggml_vld1q_s8_x2  vld1q_s8_x2
-#define ggml_vld1q_s8_x4  vld1q_s8_x4
-#define ggml_vqtbl1q_s8   vqtbl1q_s8
-#define ggml_vqtbl1q_u8   vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
-/* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
-
-struct ggml_hash_set ggml_hash_set_new(size_t size);
-
-bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// return index, asserts if table is full
-size_t ggml_hash_find_or_insert(      struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-#ifdef __cplusplus
-}
-#endif