llama_cpp 0.3.7 → 0.4.0

@@ -2,35 +2,44 @@
 
 #include "ggml.h"
 
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define GGML_CUDA_MAX_DEVICES 16
 
-void ggml_init_cublas(void);
-void ggml_cuda_set_tensor_split(const float * tensor_split);
-
-void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
-
-// TODO: export these with GGML_API
-void * ggml_cuda_host_malloc(size_t size);
-void ggml_cuda_host_free(void * ptr);
-
-void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-void ggml_cuda_free_data(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-void ggml_cuda_set_main_device(int main_device);
-void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-void ggml_cuda_set_scratch_size(size_t scratch_size);
-void ggml_cuda_free_scratch(void);
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API void ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void ggml_cuda_host_free(void * ptr);
+
+GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
+
+GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+
+GGML_API void ggml_cuda_set_main_device(int main_device);
+GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void ggml_cuda_free_scratch(void);
+GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int ggml_cuda_get_device_count(void);
+GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 
 #ifdef __cplusplus
 }
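
For context, a minimal usage sketch of the re-exported CUDA API: the GGML_CUDA_NAME / GGML_CUBLAS_NAME macros and the device-enumeration calls are the 0.4.0 additions. This is not code shipped with the gem; it assumes the declarations above live in ggml-cuda.h (as in upstream llama.cpp) and that the library was built with cuBLAS or hipBLAS support.

```c
// Hypothetical caller: report the backend name, enumerate devices, and use a
// host buffer from the backend allocator for staging tensor data.
#include <stdio.h>

#include "ggml.h"
#include "ggml-cuda.h"   // header name assumed from upstream llama.cpp

int main(void) {
    ggml_init_cublas();  // one-time backend initialization

    // expands to "ROCm"/"hipBLAS" when built with GGML_USE_HIPBLAS
    printf("backend: %s (%s)\n", GGML_CUDA_NAME, GGML_CUBLAS_NAME);

    const int n_devices = ggml_cuda_get_device_count();        // new in 0.4.0
    for (int i = 0; i < n_devices; ++i) {
        char desc[128];
        ggml_cuda_get_device_description(i, desc, sizeof desc); // new in 0.4.0
        printf("device %d: %s\n", i, desc);
    }

    // Host buffer owned by the backend; pair every ggml_cuda_host_malloc
    // with ggml_cuda_host_free.
    void * staging = ggml_cuda_host_malloc(16u * 1024 * 1024);
    if (staging != NULL) {
        // ... copy tensor data into `staging` before uploading to the device ...
        ggml_cuda_host_free(staging);
    }
    return 0;
}
```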
@@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+void * ggml_metal_host_malloc(size_t n);
+void ggml_metal_host_free (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
 
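
The two new Metal helpers mirror the CUDA host allocator. A small sketch under the same assumptions (header name ggml-metal.h taken from upstream llama.cpp, Metal support compiled in); this is illustrative code, not part of the gem:

```c
// Hypothetical caller: allocate a host buffer through the Metal backend,
// then release the buffer and the context.
#include <stdio.h>

#include "ggml-metal.h"  // header name assumed from upstream llama.cpp

int main(void) {
    struct ggml_metal_context * ctx = ggml_metal_init(1);  // 1 command buffer
    if (ctx == NULL) {
        fprintf(stderr, "Metal initialization failed\n");
        return 1;
    }

    // new in 0.4.0: host allocation helpers owned by the Metal backend
    void * data = ggml_metal_host_malloc(8u * 1024 * 1024);
    if (data != NULL) {
        // ... place tensor data here and register the region with the context
        //     (e.g. ggml_metal_add_buffer in the full header, not shown in this hunk) ...
        ggml_metal_host_free(data);
    }

    ggml_metal_free(ctx);
    return 0;
}
```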
@@ -63,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 
 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
 
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
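
ggml_metal_if_optimized changed from returning a bool to returning the length of the concur_list, and the list itself is now exposed for the allocator. A sketch of the revised flow, with the check_mem flag and the allocator hand-off taken only from the declarations and comments above; `ctx` and `gf` are assumed to exist already, and the exact semantics of check_mem are an assumption:

```c
// Hypothetical helper: run the concurrency analysis on a graph and fetch the
// resulting concur_list for the allocator.
#include <stdbool.h>
#include <stdio.h>

#include "ggml.h"
#include "ggml-metal.h"  // header name assumed from upstream llama.cpp

void plan_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf) {
    // re-run whenever the graph topology changes; the new check_mem flag is
    // passed through as declared above (true here as a conservative default)
    ggml_metal_graph_find_concurrency(ctx, gf, /*check_mem=*/true);

    // now an int: length of concur_list, presumably 0 when not optimized
    int n = ggml_metal_if_optimized(ctx);
    if (n > 0) {
        int * concur = ggml_metal_get_concur_list(ctx);  // new in 0.4.0
        printf("graph optimized: %d entries in concur_list (first = %d)\n", n, concur[0]);
        // hand `concur` to the ggml allocator, as the header comment suggests
    } else {
        printf("graph not optimized for concurrent dispatch\n");
    }
}
```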