gpt_neox_client 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/LICENSE.txt +21 -0
- data/README.md +68 -0
- data/ext/gpt_neox_client/extconf.rb +25 -0
- data/ext/gpt_neox_client/gpt_neox_client.cpp +316 -0
- data/ext/gpt_neox_client/gpt_neox_client.h +10 -0
- data/ext/gpt_neox_client/src/LICENSE +21 -0
- data/ext/gpt_neox_client/src/common-ggml.cpp +246 -0
- data/ext/gpt_neox_client/src/common-ggml.h +18 -0
- data/ext/gpt_neox_client/src/common.cpp +809 -0
- data/ext/gpt_neox_client/src/common.h +176 -0
- data/ext/gpt_neox_client/src/dr_wav.h +6434 -0
- data/ext/gpt_neox_client/src/ggml/ggml-alloc.c +594 -0
- data/ext/gpt_neox_client/src/ggml/ggml-alloc.h +26 -0
- data/ext/gpt_neox_client/src/ggml/ggml-cuda.cu +6756 -0
- data/ext/gpt_neox_client/src/ggml/ggml-cuda.h +46 -0
- data/ext/gpt_neox_client/src/ggml/ggml-metal.h +85 -0
- data/ext/gpt_neox_client/src/ggml/ggml-metal.m +1195 -0
- data/ext/gpt_neox_client/src/ggml/ggml-metal.metal +2049 -0
- data/ext/gpt_neox_client/src/ggml/ggml-opencl.cpp +1865 -0
- data/ext/gpt_neox_client/src/ggml/ggml-opencl.h +25 -0
- data/ext/gpt_neox_client/src/ggml/ggml.c +20632 -0
- data/ext/gpt_neox_client/src/ggml/ggml.h +1997 -0
- data/ext/gpt_neox_client/src/main.cpp +814 -0
- data/lib/gpt_neox_client/version.rb +7 -0
- data/lib/gpt_neox_client.rb +4 -0
- metadata +75 -0
data/ext/gpt_neox_client/src/ggml/ggml-cuda.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES 16
+
+GGML_API void   ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);
+
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+
+#ifdef __cplusplus
+}
+#endif
data/ext/gpt_neox_client/src/ggml/ggml-metal.h
@@ -0,0 +1,85 @@
+// An interface allowing to compute ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_metal_context;
+
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free(void * data);
+
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// try to find operations that can be run concurrently in the graph
+// you should run it again if the topology of your graph changes
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
+
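
Again purely as an illustration (not code shipped with this gem), the workflow described in the header comments above might look like this in a host program. It assumes the graph gf and its tensors live in a single ggml context and that the ggml_get_mem_buffer()/ggml_get_mem_size() helpers from ggml.h are available to map that context's buffer.

    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-metal.h"

    // evaluate a graph that was built on the CPU using the Metal backend
    // instead of ggml_graph_compute()
    static int run_on_metal(struct ggml_context * ctx_data, struct ggml_cgraph * gf,
                            struct ggml_tensor * input, struct ggml_tensor * output) {
        struct ggml_metal_context * ctx_metal = ggml_metal_init(1); // 1 command buffer

        // map the host buffer that backs every tensor in the graph; using the
        // full buffer size as max_size is a conservative upper bound for any tensor
        const size_t data_size = ggml_get_mem_size(ctx_data);
        if (!ggml_metal_add_buffer(ctx_metal, "data",
                                   ggml_get_mem_buffer(ctx_data), data_size, data_size)) {
            fprintf(stderr, "failed to map the data buffer\n");
            ggml_metal_free(ctx_metal);
            return 1;
        }

        ggml_metal_set_tensor(ctx_metal, input);   // host -> device
        ggml_metal_graph_compute(ctx_metal, gf);   // run the graph nodes on the GPU
        ggml_metal_get_tensor(ctx_metal, output);  // device -> host

        ggml_metal_free(ctx_metal);
        return 0;
    }

Per the header comments, a call to ggml_metal_graph_find_concurrency() before the compute step is optional and only needs to be repeated when the graph topology changes.
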