llama_cpp 0.12.1 → 0.12.2

ggml-cuda.h
@@ -18,46 +18,34 @@ extern "C" {
  #define GGML_CUDA_MAX_DEVICES 16

  // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
- GGML_API void ggml_init_cublas(void);
+ GGML_API GGML_CALL void ggml_init_cublas(void);

  // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
- GGML_API bool ggml_cublas_loaded(void);
+ GGML_API GGML_CALL bool ggml_cublas_loaded(void);

- GGML_API void * ggml_cuda_host_malloc(size_t size);
- GGML_API void ggml_cuda_host_free(void * ptr);
+ GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+ GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);

- GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
- GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
- GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
- GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+ GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

- GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
- GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
- GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-
- GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
- GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
- GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
-
- GGML_API void ggml_cuda_set_main_device(int main_device);
- GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
- GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
- GGML_API void ggml_cuda_free_scratch(void);
- GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
- GGML_API int ggml_cuda_get_device_count(void);
- GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+ GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
+ GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

  // backend API
- GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+ GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

- GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
- GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend);
+ GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

- GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+ // split tensor buffer that splits matrices by rows across multiple devices
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+ // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

- // pinned host buffer for use with CPU backend for faster copies between CPU and GPU
- GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
+ GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+ GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

  #ifdef __cplusplus
  }
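The hunk above drops the old internal ggml_cuda_* helpers and tags the surviving declarations with GGML_CALL, leaving device queries and backend construction as the public entry points. Below is a minimal, hypothetical sketch (not part of the diff or of llama.cpp) of how that surface might be exercised; it assumes ggml_backend_free from ggml-backend.h and treats a NULL return from ggml_backend_cuda_init as failure.

// Hypothetical usage sketch, not taken from the gem or from upstream llama.cpp.
#include "ggml-cuda.h"
#include "ggml-backend.h"   // for ggml_backend_free (assumption)
#include <stdio.h>

int main(void) {
    ggml_init_cublas();                 // "always success"; actual availability is checked below
    if (!ggml_cublas_loaded()) {
        fprintf(stderr, "CUDA/cuBLAS not available\n");
        return 1;
    }

    int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; ++i) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s (%zu of %zu bytes free)\n", i, desc, free_mem, total_mem);
    }

    ggml_backend_t backend = ggml_backend_cuda_init(0);  // backend for device 0; assumed NULL on failure
    if (backend != NULL) {
        ggml_backend_free(backend);
    }
    return 0;
}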
ggml-impl.h
@@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_HASHTABLE_FULL ((size_t)-1)
  #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

+ struct ggml_hash_set ggml_hash_set_new(size_t size);
+
  bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

  // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
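This ggml-impl.h hunk only exports the ggml_hash_set_new constructor next to the existing hash helpers. As a rough illustration (not from the diff), it pairs with ggml_hash_contains as sketched below; the capacity remark and the cleanup note are assumptions about internals not shown in this hunk.

// Hypothetical usage sketch of the internal hash-set helpers.
#include "ggml-impl.h"
#include <stdbool.h>

static bool already_visited(struct ggml_tensor * t) {
    // capacity hint; the table size actually chosen internally may differ (assumption)
    struct ggml_hash_set visited = ggml_hash_set_new(64);

    // a freshly created set contains nothing, so this is expected to be false
    bool seen = ggml_hash_contains(visited, t);

    // releasing the set's internal storage is not covered by this hunk and is omitted here
    return seen;
}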
ggml-metal.h
@@ -27,7 +27,6 @@
  // max memory buffers that can be mapped to the device
  #define GGML_METAL_MAX_BUFFERS 64
- #define GGML_METAL_MAX_COMMAND_BUFFERS 32

  struct ggml_tensor;
  struct ggml_cgraph;
@@ -36,73 +35,22 @@ struct ggml_cgraph;
  extern "C" {
  #endif

- //
- // internal API
- // temporary exposed to user-code
- //
-
- struct ggml_metal_context;
-
- void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
- // number of command buffers to use
- struct ggml_metal_context * ggml_metal_init(int n_cb);
- void ggml_metal_free(struct ggml_metal_context * ctx);
-
- void * ggml_metal_host_malloc(size_t n);
- void ggml_metal_host_free (void * data);
-
- // set the number of command buffers to use
- void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
- // creates a mapping between a host memory buffer and a device memory buffer
- // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
- // - the mapping is used during computation to determine the arguments of the compute kernels
- // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
- // - max_size specifies the maximum size of a tensor and is used to create shared views such
- //   that it is guaranteed that the tensor will fit in at least one of the views
- //
- bool ggml_metal_add_buffer(
-         struct ggml_metal_context * ctx,
-         const char * name,
-         void * data,
-         size_t size,
-         size_t max_size);
-
- // set data from host memory into the device
- void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
- // get data from the device into host memory
- void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
- // try to find operations that can be run concurrently in the graph
- // you should run it again if the topology of your graph changes
- void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
- // if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
- int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
- // output the concur_list for ggml_alloc
- int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
- // same as ggml_graph_compute but uses Metal
- // creates gf->n_threads command buffers in parallel
- bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
  //
  // backend API
  // user-code should use only these functions
  //

+ GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
  GGML_API ggml_backend_t ggml_backend_metal_init(void);

  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

- GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

  GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

- GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

  // helper to check if the device supports a specific family
  // ideally, the user code should be doing these checks
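With the internal ggml_metal_* context API removed, user code goes through the backend interface, including the relocated log-callback setter. Below is a hedged sketch (not part of the diff or of llama.cpp) of that flow; it assumes ggml_backend_free from ggml-backend.h and that ggml_backend_metal_init returns NULL when Metal is unavailable.

// Hypothetical usage sketch, not taken from the gem or from upstream llama.cpp.
#include "ggml-metal.h"
#include "ggml-backend.h"   // for ggml_backend_free (assumption)
#include <stdio.h>

static void log_to_stderr(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

int main(void) {
    // the log callback now lives on the backend API instead of ggml_metal_log_set_callback
    ggml_backend_metal_log_set_callback(log_to_stderr, NULL);

    ggml_backend_t backend = ggml_backend_metal_init();   // assumed NULL when Metal is unavailable
    if (backend == NULL || !ggml_backend_is_metal(backend)) {
        fprintf(stderr, "Metal backend not available\n");
        return 1;
    }

    ggml_backend_metal_set_n_cb(backend, 4);   // number of command buffers; 4 is an arbitrary example value
    ggml_backend_free(backend);
    return 0;
}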