llama_cpp 0.3.7 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/README.md +1 -1
- data/examples/chat.rb +2 -4
- data/ext/llama_cpp/extconf.rb +3 -3
- data/ext/llama_cpp/llama_cpp.cpp +118 -117
- data/ext/llama_cpp/src/ggml-alloc.c +97 -53
- data/ext/llama_cpp/src/ggml-alloc.h +4 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1010 -497
- data/ext/llama_cpp/src/ggml-cuda.h +32 -23
- data/ext/llama_cpp/src/ggml-metal.h +9 -3
- data/ext/llama_cpp/src/ggml-metal.m +142 -161
- data/ext/llama_cpp/src/ggml-metal.metal +577 -500
- data/ext/llama_cpp/src/ggml.c +2064 -233
- data/ext/llama_cpp/src/ggml.h +238 -13
- data/ext/llama_cpp/src/k_quants.c +110 -54
- data/ext/llama_cpp/src/llama-util.h +10 -8
- data/ext/llama_cpp/src/llama.cpp +4544 -2890
- data/ext/llama_cpp/src/llama.h +133 -123
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +8 -8
- metadata +2 -2
@@ -2,35 +2,44 @@
|
|
2
2
|
|
3
3
|
#include "ggml.h"
|
4
4
|
|
5
|
+
#ifdef GGML_USE_HIPBLAS
|
6
|
+
#define GGML_CUDA_NAME "ROCm"
|
7
|
+
#define GGML_CUBLAS_NAME "hipBLAS"
|
8
|
+
#else
|
9
|
+
#define GGML_CUDA_NAME "CUDA"
|
10
|
+
#define GGML_CUBLAS_NAME "cuBLAS"
|
11
|
+
#endif
|
12
|
+
|
5
13
|
#ifdef __cplusplus
|
6
14
|
extern "C" {
|
7
15
|
#endif
|
8
16
|
|
9
17
|
#define GGML_CUDA_MAX_DEVICES 16
|
10
18
|
|
11
|
-
void ggml_init_cublas(void);
|
12
|
-
void
|
13
|
-
|
14
|
-
|
15
|
-
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
16
|
-
|
17
|
-
void
|
18
|
-
|
19
|
-
|
20
|
-
void *
|
21
|
-
void
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
void
|
26
|
-
|
27
|
-
void
|
28
|
-
void
|
29
|
-
void
|
30
|
-
void
|
31
|
-
|
32
|
-
|
33
|
-
|
19
|
+
GGML_API void ggml_init_cublas(void);
|
20
|
+
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
21
|
+
GGML_API void ggml_cuda_host_free(void * ptr);
|
22
|
+
|
23
|
+
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
24
|
+
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
|
25
|
+
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
26
|
+
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
27
|
+
|
28
|
+
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
29
|
+
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
30
|
+
GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
31
|
+
|
32
|
+
GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
|
33
|
+
GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
|
34
|
+
|
35
|
+
GGML_API void ggml_cuda_set_main_device(int main_device);
|
36
|
+
GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
37
|
+
GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size);
|
38
|
+
GGML_API void ggml_cuda_free_scratch(void);
|
39
|
+
GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
40
|
+
|
41
|
+
GGML_API int ggml_cuda_get_device_count(void);
|
42
|
+
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
|
34
43
|
|
35
44
|
#ifdef __cplusplus
|
36
45
|
}
|
@@ -38,6 +38,9 @@ struct ggml_metal_context;
|
|
38
38
|
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
39
39
|
void ggml_metal_free(struct ggml_metal_context * ctx);
|
40
40
|
|
41
|
+
void * ggml_metal_host_malloc(size_t n);
|
42
|
+
void ggml_metal_host_free (void * data);
|
43
|
+
|
41
44
|
// set the number of command buffers to use
|
42
45
|
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
43
46
|
|
@@ -63,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
|
|
63
66
|
|
64
67
|
// try to find operations that can be run concurrently in the graph
|
65
68
|
// you should run it again if the topology of your graph changes
|
66
|
-
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
69
|
+
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
|
70
|
+
|
71
|
+
// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
|
72
|
+
int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
|
67
73
|
|
68
|
-
//
|
69
|
-
|
74
|
+
// output the concur_list for ggml_alloc
|
75
|
+
int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
70
76
|
|
71
77
|
// same as ggml_graph_compute but uses Metal
|
72
78
|
// creates gf->n_threads command buffers in parallel
|