llama_cpp 0.16.2 → 0.17.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/README.md +7 -12
- data/ext/llama_cpp/extconf.rb +2 -43
- data/ext/llama_cpp/llama_cpp.cpp +8 -0
- data/lib/llama_cpp/version.rb +3 -3
- data/sig/llama_cpp.rbs +3 -0
- metadata +2 -171
- data/vendor/include/.gitkeep +0 -0
- data/vendor/lib/.gitkeep +0 -0
- data/vendor/tmp/llama.cpp/LICENSE +0 -21
- data/vendor/tmp/llama.cpp/Makefile +0 -1124
- data/vendor/tmp/llama.cpp/ggml-alloc.c +0 -1041
- data/vendor/tmp/llama.cpp/ggml-alloc.h +0 -76
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +0 -153
- data/vendor/tmp/llama.cpp/ggml-backend.c +0 -2225
- data/vendor/tmp/llama.cpp/ggml-backend.h +0 -236
- data/vendor/tmp/llama.cpp/ggml-blas.cpp +0 -363
- data/vendor/tmp/llama.cpp/ggml-blas.h +0 -23
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -1805
- data/vendor/tmp/llama.cpp/ggml-cuda/acc.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/arange.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/argsort.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/binbcast.cu +0 -280
- data/vendor/tmp/llama.cpp/ggml-cuda/clamp.cu +0 -34
- data/vendor/tmp/llama.cpp/ggml-cuda/concat.cu +0 -196
- data/vendor/tmp/llama.cpp/ggml-cuda/convert.cu +0 -686
- data/vendor/tmp/llama.cpp/ggml-cuda/cpy.cu +0 -490
- data/vendor/tmp/llama.cpp/ggml-cuda/diagmask.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/dmmv.cu +0 -674
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f16.cu +0 -319
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn-tile-f32.cu +0 -312
- data/vendor/tmp/llama.cpp/ggml-cuda/fattn.cu +0 -345
- data/vendor/tmp/llama.cpp/ggml-cuda/getrows.cu +0 -178
- data/vendor/tmp/llama.cpp/ggml-cuda/im2col.cu +0 -104
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +0 -88
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +0 -419
- data/vendor/tmp/llama.cpp/ggml-cuda/norm.cu +0 -221
- data/vendor/tmp/llama.cpp/ggml-cuda/pad.cu +0 -49
- data/vendor/tmp/llama.cpp/ggml-cuda/pool2d.cu +0 -94
- data/vendor/tmp/llama.cpp/ggml-cuda/quantize.cu +0 -112
- data/vendor/tmp/llama.cpp/ggml-cuda/rope.cu +0 -271
- data/vendor/tmp/llama.cpp/ggml-cuda/scale.cu +0 -31
- data/vendor/tmp/llama.cpp/ggml-cuda/softmax.cu +0 -206
- data/vendor/tmp/llama.cpp/ggml-cuda/sumrows.cu +0 -40
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +0 -9
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +0 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +0 -8
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q2_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q3_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q4_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_1.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q5_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q6_k.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/template-instances/mmq-instance-q8_0.cu +0 -5
- data/vendor/tmp/llama.cpp/ggml-cuda/tsembd.cu +0 -47
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +0 -314
- data/vendor/tmp/llama.cpp/ggml-cuda/upscale.cu +0 -51
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +0 -3069
- data/vendor/tmp/llama.cpp/ggml-cuda.h +0 -44
- data/vendor/tmp/llama.cpp/ggml-impl.h +0 -651
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +0 -2038
- data/vendor/tmp/llama.cpp/ggml-kompute.h +0 -46
- data/vendor/tmp/llama.cpp/ggml-metal.h +0 -66
- data/vendor/tmp/llama.cpp/ggml-metal.m +0 -3273
- data/vendor/tmp/llama.cpp/ggml-metal.metal +0 -6540
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -14994
- data/vendor/tmp/llama.cpp/ggml-quants.h +0 -133
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +0 -1178
- data/vendor/tmp/llama.cpp/ggml-rpc.h +0 -24
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +0 -6351
- data/vendor/tmp/llama.cpp/ggml-sycl.h +0 -40
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +0 -144508
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +0 -7183
- data/vendor/tmp/llama.cpp/ggml-vulkan.h +0 -29
- data/vendor/tmp/llama.cpp/ggml.c +0 -22506
- data/vendor/tmp/llama.cpp/ggml.h +0 -2458
- data/vendor/tmp/llama.cpp/llama.cpp +0 -18985
- data/vendor/tmp/llama.cpp/llama.h +0 -1147
- data/vendor/tmp/llama.cpp/scripts/get-flags.mk +0 -38
- data/vendor/tmp/llama.cpp/sgemm.cpp +0 -1032
- data/vendor/tmp/llama.cpp/sgemm.h +0 -14
- data/vendor/tmp/llama.cpp/unicode-data.cpp +0 -7033
- data/vendor/tmp/llama.cpp/unicode-data.h +0 -20
- data/vendor/tmp/llama.cpp/unicode.cpp +0 -810
- data/vendor/tmp/llama.cpp/unicode.h +0 -63
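In the summary above, data/ext/llama_cpp/extconf.rb loses 43 lines while the entire data/vendor/tmp/llama.cpp tree is removed, which suggests the extension no longer compiles a bundled copy of llama.cpp and instead links against a copy installed on the system. A minimal sketch of what such an extconf.rb could look like with Ruby's mkmf follows; the header name, library name, and error messages are illustrative assumptions, not taken from this release:

  # extconf.rb sketch (hypothetical): link the native extension against a
  # pre-installed llama.cpp instead of building the vendored sources.
  require 'mkmf'

  # fail early if the llama.cpp header or shared library cannot be found
  abort 'llama.h is not found'  unless have_header('llama.h')
  abort 'libllama is not found' unless have_library('llama')

  create_makefile('llama_cpp/llama_cpp')

Under that assumption, users would be expected to install llama.cpp themselves (via a package manager or a local build) before running gem install llama_cpp; the removed diffs below show the vendored sources that previously made this unnecessary.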
data/vendor/tmp/llama.cpp/ggml-backend.h
@@ -1,236 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend_event * ggml_backend_event_t;
-typedef struct ggml_backend * ggml_backend_t;
-typedef void * ggml_backend_graph_plan_t;
-
-//
-// Backend buffer
-//
-
-// buffer type
-GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
-GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
-GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
-
-// buffer
-enum ggml_backend_buffer_usage {
-    GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-    GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-};
-
-GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
-GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
-GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
-GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
-GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
-GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
-GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
-GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
-GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
-
-//
-// Backend
-//
-
-GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
-GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-GGML_API void ggml_backend_free(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
-GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
-
-GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-
-GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-
-GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
-GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
-// tensor copy between different backends
-GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-// asynchronous copy
-// the copy is performed after all the currently queued operations in backend_src
-// backend_dst will wait for the copy to complete before performing other operations
-// automatic fallback to sync copy if async is not supported
-GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-// events
-GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
-GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
-GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
-GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
-
-//
-// CPU backend
-//
-
-GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
-GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-// Create a backend buffer from an existing pointer
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-//
-// Backend registry
-//
-
-// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-GGML_API size_t ggml_backend_reg_get_count(void);
-GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
-GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
-GGML_API const char * ggml_backend_reg_get_name(size_t i);
-GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-//
-// Backend scheduler
-//
-
-// The backend scheduler allows for multiple backends to be used together
-// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-// The backends are selected based on:
-// - the backend that supports the operation
-// - the location of the pre-allocated tensors (e.g. the weights)
-/*
-  Example usage:
-
-    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-    // preferrably to run on the same backend as the buffer
-    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
-    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
-    // initialize buffers from a max size graph (optional)
-    reserve_graph = build_graph(sched, max_batch_size);
-
-    // manually assign nodes to a backend (optional, should not be needed in most cases)
-    struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-    ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
-    ggml_backend_sched_reserve(sched, reserve_graph);
-
-    // compute
-    graph = build_graph(sched);
-    ggml_backend_sched_graph_compute(sched, graph);
-
-    // if there are graph inputs:
-    ggml_backend_sched_reset(sched);
-    ggml_backend_sched_alloc_graph(sched, graph);
-    ggml_backend_tensor_set(input_tensor, ...);
-    ggml_backend_sched_graph_compute(sched, graph);
-}
-*/
-
-struct ggml_backend_sched;
-typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-// when ask == true, the scheduler wants to know if the user wants to observe this node
-// this allows the scheduler to batch nodes together in order to evaluate them in a single call
-//
-// when ask == false, the scheduler is passing the node tensor to the user for observation
-// if the user returns false, the scheduler will cancel the graph compute
-//
-typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
-// Initialize a backend scheduler
-GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
-GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-// Initialize backend buffers from a measure graph
-GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
-GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
-GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
-
-// Get the number of splits of the last graph
-GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
-GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
-GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-// Allocate and compute graph on the backend scheduler
-GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
-// Reset all assignments and allocators - must be called before changing the node backends
-GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-// Set a callback to be called for each resulting node during graph compute
-GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-//
-// Utils
-//
-
-struct ggml_backend_graph_copy {
-    ggml_backend_buffer_t buffer;
-    struct ggml_context * ctx_allocated;
-    struct ggml_context * ctx_unallocated;
-    struct ggml_cgraph * graph;
-};
-
-// Copy a graph to a different backend
-GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-// Compare the output of two backends
-GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-// Tensor initialization
-GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
-
-
-#ifdef __cplusplus
-}
-#endif
data/vendor/tmp/llama.cpp/ggml-blas.cpp
@@ -1,363 +0,0 @@
-#include "ggml-blas.h"
-#include "ggml-backend-impl.h"
-
-#include <future>
-#include <vector>
-
-#if defined(GGML_USE_ACCELERATE)
-#   include <Accelerate/Accelerate.h>
-#elif defined(GGML_BLAS_USE_MKL)
-#   include <mkl.h>
-#else
-#   include <cblas.h>
-#   ifdef BLIS_ENABLE_CBLAS
-#       include <blis.h>
-#   endif
-#endif
-
-struct ggml_backend_blas_context {
-    int n_threads = GGML_DEFAULT_N_THREADS;
-    std::unique_ptr<char[]> work_data;
-    size_t work_size = 0;
-#ifndef GGML_USE_OPENMP
-    std::vector<std::future<void>> tasks;
-#endif
-};
-
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-
-static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const enum ggml_type type = src0->type;
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == ggml_type_size(type));
-    GGML_ASSERT(nb10 == ggml_type_size(src1->type));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const int64_t ne_plane = ne01*ne00;
-    const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
-
-    if (ctx->work_size < desired_wsize) {
-        ctx->work_data.reset(new char[desired_wsize]);
-        ctx->work_size = desired_wsize;
-    }
-    void * wdata = ctx->work_data.get();
-
-    // convert src0 to float
-    if (type != GGML_TYPE_F32) {
-        ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-        ggml_to_float_t const to_float = type_traits.to_float;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
-                float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
-
-                const int min_cols_per_thread = 4096;
-                const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
-                const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
-
-#ifdef GGML_USE_OPENMP
-                #pragma omp parallel for num_threads(n_threads)
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                }
-#else
-                for (int i = 1; i < n_threads; i++) {
-                    const int64_t start = i*ne01/n_threads;
-                    const int64_t end = (i + 1)*ne01/n_threads;
-                    if (start < end) {
-                        ctx->tasks.push_back(std::async(std::launch::async, [=]() {
-                            for (int64_t i01 = start; i01 < end; i01++) {
-                                to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                            }
-                        }));
-                    }
-                }
-                {
-                    // reuse the current thread for the first task
-                    const int64_t start = 0;
-                    const int64_t end = ne01/n_threads;
-                    for (int64_t i01 = start; i01 < end; i01++) {
-                        to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
-                    }
-                }
-#endif
-            }
-        }
-
-#ifndef GGML_USE_OPENMP
-        // wait for all tasks to finish
-        for (auto & task : ctx->tasks) {
-            task.get();
-        }
-        ctx->tasks.clear();
-#endif
-    }
-
-#if defined(OPENBLAS_VERSION)
-    openblas_set_num_threads(ctx->n_threads);
-#endif
-
-#if defined(BLIS_ENABLE_CBLAS)
-    bli_thread_set_num_threads(ctx->n_threads);
-#endif
-
-    for (int64_t i13 = 0; i13 < ne13; i13++) {
-        for (int64_t i12 = 0; i12 < ne12; i12++) {
-            const int64_t i03 = i13/r3;
-            const int64_t i02 = i12/r2;
-
-            const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-            const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-            if (type != GGML_TYPE_F32) {
-                x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
-            }
-
-            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne1, ne01, ne10,
-                        1.0f, y, ne10,
-                              x, ne00,
-                        0.0f, d, ne01);
-        }
-    }
-}
-
-static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne3 == ne13);
-    GGML_ASSERT(ne03 == ne13);
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    // GGML_ASSERT(nb0 <= nb1);
-    // GGML_ASSERT(nb1 <= nb2);
-    // GGML_ASSERT(nb2 <= nb3);
-
-    // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
-    // src0: (k,n)
-    // src1: (k,m)
-    // dst:  (m,n)
-    //
-    // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
-    // Also expressed as (major,minor)
-    // a: (m,k): so src1 transposed
-    // b: (k,n): so src0
-    // c: (m,n)
-    //
-    // However, if ggml_is_transposed(src1) is true, then
-    // src1->data already contains a transposed version, so sgemm mustn't
-    // transpose it further.
-
-    int n = src0->ne[0];
-    int k = src0->ne[1];
-    int m = src1->ne[0];
-
-    CBLAS_TRANSPOSE transposeA;
-    int lda;
-
-    if (!ggml_is_transposed(src1)) {
-        transposeA = CblasTrans;
-        lda = m;
-    } else {
-        transposeA = CblasNoTrans;
-        lda = k;
-    }
-
-    float * a = (float *) ((char *) src1->data);
-    float * b = (float *) ((char *) src0->data);
-    float * c = (float *) ((char *) dst->data);
-
-    cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
-    GGML_UNUSED(ctx);
-}
-
-// backend interface
-
-GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
-    return "BLAS";
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
-    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
-    delete ctx;
-    delete backend;
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_cpu_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_backend_blas_mul_mat(ctx, node);
-                break;
-
-            case GGML_OP_OUT_PROD:
-                ggml_backend_blas_out_prod(ctx, node);
-                break;
-
-            case GGML_OP_NONE:
-            case GGML_OP_RESHAPE:
-            case GGML_OP_VIEW:
-            case GGML_OP_PERMUTE:
-            case GGML_OP_TRANSPOSE:
-                break;
-
-            default:
-                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
-                GGML_ASSERT(false);
-        }
-    }
-
-    return GGML_STATUS_SUCCESS;
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
-           (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
-            op->src[1]->type == GGML_TYPE_F32 &&
-            ggml_is_matrix(src0) &&
-            ggml_is_matrix(src1) &&
-            ggml_is_contiguous(src0) &&
-            (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i blas_backend_i = {
-    /* .get_name                = */ ggml_backend_blas_name,
-    /* .free                    = */ ggml_backend_blas_free,
-    /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_async        = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_update       = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_blas_graph_compute,
-    /* .supports_op             = */ ggml_backend_blas_supports_op,
-    /* .supports_buft           = */ ggml_backend_blas_supports_buft,
-    /* .offload_op              = */ NULL,
-    /* .event_new               = */ NULL,
-    /* .event_free              = */ NULL,
-    /* .event_record            = */ NULL,
-    /* .event_wait              = */ NULL,
-    /* .event_synchronize       = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_blas_guid(void) {
-    static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
-    return &guid;
-}
-
-ggml_backend_t ggml_backend_blas_init(void) {
-    ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
-
-    ggml_backend_t backend = new ggml_backend {
-        /* .guid      = */ ggml_backend_blas_guid(),
-        /* .interface = */ blas_backend_i,
-        /* .context   = */ ctx,
-    };
-
-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
-    if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
-    }
-#endif
-
-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
-#endif
-
-    return backend;
-}
-
-GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
-    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
-}
-
-void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
-    GGML_ASSERT(ggml_backend_is_blas(backend_blas));
-
-    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
-    ctx->n_threads = n_threads;
-}
data/vendor/tmp/llama.cpp/ggml-blas.h
@@ -1,23 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
-
-GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
-
-// number of threads used for conversion to float
-// for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
-
-
-#ifdef __cplusplus
-}
-#endif