llama_cpp 0.12.1 → 0.12.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-cuda.h
@@ -18,46 +18,34 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void ggml_init_cublas(void);
+GGML_API GGML_CALL void ggml_init_cublas(void);

 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool ggml_cublas_loaded(void);
+GGML_API GGML_CALL bool ggml_cublas_loaded(void);

-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void ggml_cuda_host_free(void * ptr);
+GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);

-GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API
-GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
+GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

-GGML_API void
-GGML_API void
-GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-
-GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
-GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
-GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
-
-GGML_API void ggml_cuda_set_main_device(int main_device);
-GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
-GGML_API void ggml_cuda_free_scratch(void);
-GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API int ggml_cuda_get_device_count(void);
-GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
-GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

-
-GGML_API
+GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

 #ifdef __cplusplus
 }
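The hunk above drops the old per-tensor CUDA helpers (scratch buffers, manual device assignment) and annotates the surviving declarations with GGML_CALL, while adding backend-level device queries plus split and pinned-host buffer types. The following C sketch shows one way calling code might use the new backend entry points; the usage pattern is an assumption, not taken from the gem, and only functions declared in this header together with the generic ggml-backend API (ggml_backend_free) are referenced.

// Sketch: enumerate CUDA devices, query their memory, and create a backend
// for device 0. Assumed usage pattern, not part of the gem itself.
#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-cuda.h"

int main(void) {
    int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; ++i) {
        char desc[128];
        size_t free_mem  = 0;
        size_t total_mem = 0;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s (%zu / %zu bytes free)\n", i, desc, free_mem, total_mem);
    }

    ggml_backend_t backend = ggml_backend_cuda_init(0);
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize the CUDA backend\n");
        return 1;
    }
    // ... allocate tensors via ggml_backend_cuda_buffer_type(0) or
    //     ggml_backend_cuda_split_buffer_type(...) and run the graph ...
    ggml_backend_free(backend);
    return 0;
}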
data/vendor/tmp/llama.cpp/ggml-impl.h
@@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_HASHTABLE_FULL ((size_t)-1)
 #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

+struct ggml_hash_set ggml_hash_set_new(size_t size);
+
 bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
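ggml-impl.h now exposes ggml_hash_set_new next to ggml_hash_contains. The sketch below illustrates how such a hash set can track visited tensors while walking a graph; it is illustrative only — ggml_hash_insert and the size/keys fields of struct ggml_hash_set are assumed to be declared as in upstream llama.cpp, and this is internal API rather than part of the gem's public surface.

// Sketch: deduplicate the nodes of a ggml graph with the hash-set helpers.
// Assumptions: ggml_hash_insert() is declared alongside ggml_hash_contains(),
// and ggml_hash_set_new() heap-allocates the keys array.
#include <stdlib.h>
#include "ggml.h"
#include "ggml-impl.h"

void mark_graph_nodes(struct ggml_cgraph * gf) {
    // size the table for every tensor that can appear in the graph
    struct ggml_hash_set visited = ggml_hash_set_new((size_t)(gf->n_nodes + gf->n_leafs));

    for (int i = 0; i < gf->n_nodes; ++i) {
        struct ggml_tensor * node = gf->nodes[i];
        if (!ggml_hash_contains(visited, node)) {
            ggml_hash_insert(visited, node); // assumption: internal insert helper
        }
    }

    free(visited.keys); // assumption: keys buffer comes from the allocator
}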
data/vendor/tmp/llama.cpp/ggml-metal.h
@@ -27,7 +27,6 @@

 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
-#define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;
 struct ggml_cgraph;
@@ -36,73 +35,22 @@ struct ggml_cgraph;
 extern "C" {
 #endif

-//
-// internal API
-// temporary exposed to user-code
-//
-
-struct ggml_metal_context;
-
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-void * ggml_metal_host_malloc(size_t n);
-void ggml_metal_host_free (void * data);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
-//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-        const char * name,
-        void * data,
-        size_t size,
-        size_t max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// try to find operations that can be run concurrently in the graph
-// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
-// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
-// output the concur_list for ggml_alloc
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
 //
 // backend API
 // user-code should use only these functions
 //

+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
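With the internal ggml_metal_* API removed, Metal is now driven only through the backend interface that remains in this header. The C sketch below shows the assumed replacement pattern (backend log callback, backend init, command-buffer count); it is illustrative only, mirrors the declarations kept above, and does not describe the gem's Ruby bindings.

// Sketch: initialize the Metal backend through the backend-level API that
// replaces the removed ggml_metal_* calls. Assumed usage, not from the gem.
#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-metal.h"

static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) user_data;
    fputs(text, stderr);
}

int main(void) {
    // route Metal backend logs through a user callback
    // (stands in for the removed ggml_metal_log_set_callback)
    ggml_backend_metal_log_set_callback(my_log, NULL);

    // ggml_backend_metal_init/ggml_backend_free replace ggml_metal_init/ggml_metal_free
    ggml_backend_t backend = ggml_backend_metal_init();
    if (backend == NULL || !ggml_backend_is_metal(backend)) {
        fprintf(stderr, "Metal backend not available\n");
        return 1;
    }

    ggml_backend_metal_set_n_cb(backend, 4); // number of command buffers
    // ... allocate tensors with ggml_backend_metal_buffer_type() and run the graph ...
    ggml_backend_free(backend);
    return 0;
}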