llama_cpp 0.1.4 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +36 -0
- data/examples/README.md +60 -0
- data/examples/chat.rb +195 -0
- data/ext/llama_cpp/extconf.rb +26 -1
- data/ext/llama_cpp/llama_cpp.cpp +262 -13
- data/ext/llama_cpp/src/ggml-cuda.cu +2483 -0
- data/ext/llama_cpp/src/ggml-cuda.h +18 -2
- data/ext/llama_cpp/src/ggml-metal.h +64 -0
- data/ext/llama_cpp/src/ggml-metal.m +834 -0
- data/ext/llama_cpp/src/ggml-metal.metal +1436 -0
- data/ext/llama_cpp/src/ggml-opencl.cpp +207 -40
- data/ext/llama_cpp/src/ggml-opencl.h +4 -1
- data/ext/llama_cpp/src/ggml.c +2236 -404
- data/ext/llama_cpp/src/ggml.h +170 -8
- data/ext/llama_cpp/src/k_quants.c +2244 -0
- data/ext/llama_cpp/src/k_quants.h +122 -0
- data/ext/llama_cpp/src/llama-util.h +16 -0
- data/ext/llama_cpp/src/llama.cpp +631 -179
- data/ext/llama_cpp/src/llama.h +51 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +36 -1
- metadata +10 -2
@@ -1,10 +1,19 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
1
3
|
#include "ggml.h"
|
2
4
|
|
3
5
|
#ifdef __cplusplus
|
4
6
|
extern "C" {
|
5
7
|
#endif
|
6
8
|
|
9
|
+
#define GGML_CUDA_MAX_DEVICES 16
|
10
|
+
|
11
|
+
struct ggml_tensor_extra_gpu {
|
12
|
+
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
13
|
+
};
|
14
|
+
|
7
15
|
void ggml_init_cublas(void);
|
16
|
+
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
8
17
|
|
9
18
|
void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
10
19
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
@@ -15,8 +24,15 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
|
|
15
24
|
void * ggml_cuda_host_malloc(size_t size);
|
16
25
|
void ggml_cuda_host_free(void * ptr);
|
17
26
|
|
18
|
-
void
|
19
|
-
|
27
|
+
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
28
|
+
|
29
|
+
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
30
|
+
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
31
|
+
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
32
|
+
void ggml_cuda_set_main_device(int main_device);
|
33
|
+
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
34
|
+
void ggml_cuda_free_scratch(void);
|
35
|
+
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
20
36
|
|
21
37
|
#ifdef __cplusplus
|
22
38
|
}
|
@@ -0,0 +1,64 @@
|
|
1
|
+
// An interface for computing a ggml_cgraph with Metal
|
2
|
+
//
|
3
|
+
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
4
|
+
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
|
5
|
+
//
|
6
|
+
// How does it work?
|
7
|
+
//
|
8
|
+
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
|
9
|
+
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
|
10
|
+
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
|
11
|
+
//
|
12
|
+
// You only need to make sure that all memory buffers used during graph creation
|
13
|
+
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
14
|
+
// used during the graph evaluation to determine the arguments of the compute kernels.
|
15
|
+
//
|
16
|
+
// Synchronization between device and host memory (for example for input and output tensors)
|
17
|
+
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
|
18
|
+
//
|
19
|
+
|
20
|
+
#pragma once
|
21
|
+
|
22
|
+
#include <stddef.h>
|
23
|
+
#include <stdbool.h>
|
24
|
+
|
25
|
+
// max memory buffers that can be mapped to the device
|
26
|
+
#define GGML_METAL_MAX_BUFFERS 16
|
27
|
+
|
28
|
+
struct ggml_tensor;
|
29
|
+
struct ggml_cgraph;
|
30
|
+
|
31
|
+
#ifdef __cplusplus
|
32
|
+
extern "C" {
|
33
|
+
#endif
|
34
|
+
|
35
|
+
struct ggml_metal_context;
|
36
|
+
|
37
|
+
struct ggml_metal_context * ggml_metal_init(void);
|
38
|
+
void ggml_metal_free(struct ggml_metal_context * ctx);
|
39
|
+
|
40
|
+
// creates a mapping between a host memory buffer and a device memory buffer
|
41
|
+
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42
|
+
// - the mapping is used during computation to determine the arguments of the compute kernels
|
43
|
+
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
44
|
+
//
|
45
|
+
bool ggml_metal_add_buffer(
|
46
|
+
struct ggml_metal_context * ctx,
|
47
|
+
const char * name,
|
48
|
+
void * data,
|
49
|
+
size_t size);
|
50
|
+
|
51
|
+
// set data from host memory into the device
|
52
|
+
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
53
|
+
|
54
|
+
// get data from the device into host memory
|
55
|
+
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
56
|
+
|
57
|
+
// same as ggml_graph_compute but uses Metal
|
58
|
+
// creates gf->n_threads command buffers in parallel
|
59
|
+
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
60
|
+
|
61
|
+
#ifdef __cplusplus
|
62
|
+
}
|
63
|
+
#endif
|
64
|
+
|