llama_cpp 0.2.0 → 0.2.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/examples/README.md +92 -0
- data/examples/chat.rb +195 -0
- data/examples/embedding.rb +37 -0
- data/ext/llama_cpp/llama_cpp.cpp +52 -0
- data/ext/llama_cpp/src/ggml-cuda.cu +1218 -411
- data/ext/llama_cpp/src/ggml-cuda.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.h +5 -1
- data/ext/llama_cpp/src/ggml-metal.m +703 -514
- data/ext/llama_cpp/src/ggml-metal.metal +574 -122
- data/ext/llama_cpp/src/ggml-opencl.cpp +496 -36
- data/ext/llama_cpp/src/ggml-opencl.h +1 -2
- data/ext/llama_cpp/src/ggml.c +2715 -476
- data/ext/llama_cpp/src/ggml.h +266 -11
- data/ext/llama_cpp/src/llama.cpp +266 -135
- data/ext/llama_cpp/src/llama.h +19 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +3 -0
- metadata +5 -2
data/ext/llama_cpp/src/ggml-cuda.h

```diff
@@ -24,11 +24,14 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
-void   ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+
 void   ggml_cuda_free_data(struct ggml_tensor * tensor);
 void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void   ggml_cuda_set_main_device(int main_device);
 void   ggml_cuda_set_scratch_size(size_t scratch_size);
+void   ggml_cuda_free_scratch(void);
 bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
```
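To illustrate how these new entry points relate, here is a minimal host-side sketch — not code from the gem or from llama.cpp, and the buffer sizes, the tensor shape, and the overall flow are assumptions — showing the scratch pool being sized up front, a weight uploaded via the new two-argument `ggml_cuda_transform_tensor`, and the pool released with the new `ggml_cuda_free_scratch`:

```c
/* Hypothetical sketch only: exercises the entry points declared in the
 * patched ggml-cuda.h above; sizes and the tensor shape are made up. */
#include <stdbool.h>
#include <stdlib.h>
#include "ggml.h"
#include "ggml-cuda.h"

int main(void) {
    /* Reserve a scratch pool for intermediate results on the GPU; it is
     * released later with the newly added ggml_cuda_free_scratch(). */
    ggml_cuda_set_scratch_size((size_t)512 * 1024 * 1024);
    ggml_cuda_set_main_device(0);

    struct ggml_init_params params = {
        .mem_size   = (size_t)64 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    /* A weight tensor whose host data is uploaded straight into VRAM; the
     * patched signature now takes the host data pointer explicitly instead
     * of reading it out of the tensor. */
    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 4096);
    ggml_cuda_transform_tensor(w->data, w);

    /* Graph tensors that must outlive a single evaluation would be placed
     * outside the scratch pool with ggml_cuda_assign_buffers_no_scratch();
     * intermediate nodes would use plain ggml_cuda_assign_buffers(). */

    /* ... build and evaluate a graph here ... */

    ggml_cuda_free_data(w);
    ggml_cuda_free_scratch();
    ggml_free(ctx);
    return 0;
}
```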
data/ext/llama_cpp/src/ggml-metal.h

```diff
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
         const char * name,
         void * data,
-        size_t size);
+        size_t size,
+        size_t max_size);
 
 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
```
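A call site adapting to the new signature might look like the following sketch; the wrapper name `map_model_buffer`, the `"model"` buffer name, and the parameter names are hypothetical, but the `ggml_metal_add_buffer` signature is exactly the one declared above:

```c
#include "ggml.h"
#include "ggml-metal.h"

/* Hypothetical wrapper showing the call-site change for the new argument. */
static bool map_model_buffer(struct ggml_metal_context * ctx_metal,
                             void * model_data, size_t model_size,
                             size_t max_tensor_size) {
    /* Before: ggml_metal_add_buffer(ctx_metal, "model", model_data, model_size);
     * After:  the extra max_size lets Metal split a large buffer into shared
     * views while guaranteeing any single tensor fits inside one view. */
    return ggml_metal_add_buffer(ctx_metal, "model", model_data, model_size,
                                 max_tensor_size);
}
```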
```diff
@@ -55,6 +58,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
```
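Putting the Metal pieces together, a minimal end-to-end sketch might look like this — the function name `eval_on_metal`, the `"eval"` buffer name, and the choice of `n_threads` are assumptions, while the mapping-before-compute requirement and the per-`n_threads` command buffers come from the header comments in the hunks above:

```c
#include "ggml.h"
#include "ggml-metal.h"

/* Hypothetical flow: map host buffers, run the graph on Metal, read back. */
void eval_on_metal(struct ggml_cgraph * gf,
                   void * eval_data, size_t eval_size, size_t max_tensor_size) {
    struct ggml_metal_context * ctx_metal = ggml_metal_init();

    /* Every host buffer the graph touches must be mapped before computing
     * (see the comments above ggml_metal_add_buffer). */
    ggml_metal_add_buffer(ctx_metal, "eval", eval_data, eval_size, max_tensor_size);

    /* Per the new comment, ggml_metal_graph_compute creates gf->n_threads
     * command buffers in parallel. */
    gf->n_threads = 1;
    ggml_metal_graph_compute(ctx_metal, gf);

    /* Copy a result back to host memory, e.g. the graph's last node. */
    ggml_metal_get_tensor(ctx_metal, gf->nodes[gf->n_nodes - 1]);

    ggml_metal_free(ctx_metal);
}
```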