llama_cpp 0.9.1 → 0.9.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +383 -210
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +277 -53
- data/ext/llama_cpp/src/ggml-cuda.h +5 -0
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +112 -30
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +173 -73
- data/ext/llama_cpp/src/ggml.c +877 -1707
- data/ext/llama_cpp/src/ggml.h +68 -45
- data/ext/llama_cpp/src/llama.cpp +475 -117
- data/ext/llama_cpp/src/llama.h +11 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
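
The most API-visible change in this release is the rewrite of `data/ext/llama_cpp/src/ggml-backend.h`, excerpted below: the backend and buffer internals move into the new `ggml-backend-impl.h`, the public types become opaque handles, and a backend scheduler API is added.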
```diff
--- a/data/ext/llama_cpp/src/ggml-backend.h
+++ b/data/ext/llama_cpp/src/ggml-backend.h
@@ -1,51 +1,20 @@
 #pragma once
 
 #include "ggml.h"
+#include "ggml-alloc.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-    struct ggml_backend;
-    struct ggml_backend_buffer;
-
-    // type-erased backend-specific types / wrappers
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_backend_graph_plan_t;
-    typedef void * ggml_backend_buffer_context_t;
-
-    // avoid accessing internals of these types
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
     //
-    //
+    // Backend buffer
     //
 
-    struct ggml_backend_buffer_i {
-        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i iface;
-
-        ggml_backend_t backend;
-        ggml_backend_buffer_context_t context;
-
-        size_t size;
-    };
+    struct ggml_backend_buffer;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
     // backend buffer functions
-    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-            struct ggml_backend                  * backend,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
```
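
With this hunk, `ggml_backend_buffer_t` becomes an opaque handle: the `ggml_backend_buffer_i` vtable and the struct layout are no longer visible to clients, which are limited to the accessor functions. A minimal client-side sketch of the resulting style; it assumes the CPU backend plus `ggml_backend_alloc_buffer()` and `ggml_backend_free()`, which the header declares in unchanged sections not shown in the diff:

```c
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    // assumed API: ggml_backend_alloc_buffer() and ggml_backend_free() come
    // from unchanged parts of ggml-backend.h not shown in the hunks above
    ggml_backend_t cpu = ggml_backend_cpu_init();

    // allocate a 16 MiB buffer; the struct fields are hidden now,
    // so everything goes through the accessor functions
    ggml_backend_buffer_t buf = ggml_backend_alloc_buffer(cpu, 16u * 1024 * 1024);

    printf("base = %p, alignment = %zu\n",
           ggml_backend_buffer_get_base(buf),
           ggml_backend_buffer_get_alignment(buf));

    ggml_backend_buffer_free(buf);
    ggml_backend_free(cpu);
    return 0;
}
```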
```diff
@@ -55,50 +24,13 @@ extern "C" {
     GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 
     //
-    //
+    // Backend
     //
 
-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
-
-        // get buffer alignment
-        size_t (*get_alignment)(ggml_backend_t backend);
-
-        // tensor data access
-        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        void (*synchronize)     (ggml_backend_t backend);
-
-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    // TODO: hide behind API
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
 
-    // backend helper functions
     GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
 
     GGML_API const char * ggml_backend_name(ggml_backend_t backend);
```
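
The backend handle gets the same treatment: the `ggml_backend_i` vtable and `struct ggml_backend` are not deleted but relocated to the new `ggml-backend-impl.h` (+87 lines in the file list above), which backend implementers include instead. Application code sees only the opaque `ggml_backend_t`; a short sketch using only functions visible in this hunk:

```c
#include <stdio.h>
#include "ggml-backend.h"

// Illustrative helper (not part of the gem): inspect a backend through the
// opaque handle; ggml_get_backend() maps a tensor to the backend that owns it.
static void describe(ggml_backend_t backend, const struct ggml_tensor * tensor) {
    printf("backend: %s\n", ggml_backend_name(backend));

    ggml_backend_t owner = ggml_get_backend(tensor);
    if (owner != NULL) {
        printf("tensor owner: %s\n", ggml_backend_name(owner));
    }
}
```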
```diff
@@ -133,11 +65,72 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
     GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-
     GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
+    // Create a backend buffer from an existing pointer
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
 
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+            ggml_allocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
 #ifdef __cplusplus
 }
 #endif
```
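
The `Example usage` comment in the new scheduler section is pseudocode (note that it calls `ggml_backend_sched_get_allocr`, while the declared function is `ggml_backend_sched_get_tallocr`). A slightly more concrete sketch of the same measure-then-compute flow follows; `build_graph()`, the tensor shapes, and the two-backend setup are invented for illustration, and `ggml_tallocr_alloc()` is assumed from the updated `ggml-alloc.h`:

```c
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Hypothetical graph builder: pre-allocates its inputs with the scheduler's
// per-backend tensor allocator, as the header's usage comment recommends.
static struct ggml_cgraph * build_graph(struct ggml_context * ctx,
                                        ggml_backend_sched_t sched,
                                        ggml_backend_t       backend_cpu) {
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);

    ggml_tallocr_t alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
    ggml_tallocr_alloc(alloc_cpu, a); // while sched is in measure mode this only records sizes
    ggml_tallocr_alloc(alloc_cpu, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, ggml_mul_mat(ctx, a, b));
    return gf;
}

// backend_gpu / backend_cpu are created elsewhere (e.g. ggml_backend_cpu_init());
// a single ggml_context is reused here for brevity.
static void run(struct ggml_context * ctx, ggml_backend_t backend_gpu, ggml_backend_t backend_cpu) {
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, 2);

    // measure pass: sizes the compute buffers; the scheduler cannot compute before this
    struct ggml_cgraph * measure_graph = build_graph(ctx, sched, backend_cpu);
    ggml_backend_sched_init_measure(sched, measure_graph);

    // compute pass: ops run on backend_gpu when supported, falling back to backend_cpu
    struct ggml_cgraph * graph = build_graph(ctx, sched, backend_cpu);
    ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
}
```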