llama_cpp 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,10 +1,19 @@
1
+ #pragma once
2
+
1
3
  #include "ggml.h"
2
4
 
3
5
  #ifdef __cplusplus
4
6
  extern "C" {
5
7
  #endif
6
8
 
9
+ #define GGML_CUDA_MAX_DEVICES 16
10
+
11
+ struct ggml_tensor_extra_gpu {
12
+ void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
13
+ };
14
+
7
15
  void ggml_init_cublas(void);
16
+ void ggml_cuda_set_tensor_split(const float * tensor_split);
8
17
 
9
18
  void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
10
19
  bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
@@ -15,8 +24,12 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
15
24
  void * ggml_cuda_host_malloc(size_t size);
16
25
  void ggml_cuda_host_free(void * ptr);
17
26
 
18
- void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
19
- void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
27
+ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
28
+ void ggml_cuda_free_data(struct ggml_tensor * tensor);
29
+ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
30
+ void ggml_cuda_set_main_device(int main_device);
31
+ void ggml_cuda_set_scratch_size(size_t scratch_size);
32
+ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
20
33
 
21
34
  #ifdef __cplusplus
22
35
  }
@@ -0,0 +1,63 @@
1
+ // An interface for computing a ggml_cgraph with Metal
2
+ //
3
+ // This is a fully functional interface that extends ggml with GPU support for Apple devices.
4
+ // A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
5
+ //
6
+ // How does it work?
7
+ //
8
+ // As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
9
+ // interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
10
+ // use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
11
+ //
12
+ // You only need to make sure that all memory buffers that you used during the graph creation
13
+ // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
14
+ // used during the graph evaluation to determine the arguments of the compute kernels.
15
+ //
16
+ // Synchronization between device and host memory (for example for input and output tensors)
17
+ // is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
18
+ //
19
+
20
+ #pragma once
21
+
22
+ #include <stddef.h>
23
+ #include <stdbool.h>
24
+
25
+ // max memory buffers that can be mapped to the device
26
+ #define GGML_METAL_MAX_BUFFERS 16
27
+
28
+ struct ggml_tensor;
29
+ struct ggml_cgraph;
30
+
31
+ #ifdef __cplusplus
32
+ extern "C" {
33
+ #endif
34
+
35
+ struct ggml_metal_context;
36
+
37
+ struct ggml_metal_context * ggml_metal_init(void);
38
+ void ggml_metal_free(struct ggml_metal_context * ctx);
39
+
40
+ // creates a mapping between a host memory buffer and a device memory buffer
41
+ // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
42
+ // - the mapping is used during computation to determine the arguments of the compute kernels
43
+ // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
44
+ //
45
+ bool ggml_metal_add_buffer(
46
+ struct ggml_metal_context * ctx,
47
+ const char * name,
48
+ void * data,
49
+ size_t size);
50
+
51
+ // set data from host memory into the device
52
+ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
53
+
54
+ // get data from the device into host memory
55
+ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
56
+
57
+ // same as ggml_graph_compute but uses Metal
58
+ void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
59
+
60
+ #ifdef __cplusplus
61
+ }
62
+ #endif
63
+