llama_cpp 0.9.2 → 0.9.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -1
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/ext/llama_cpp/src/ggml-alloc.c +378 -208
- data/ext/llama_cpp/src/ggml-alloc.h +68 -16
- data/ext/llama_cpp/src/ggml-backend-impl.h +87 -0
- data/ext/llama_cpp/src/ggml-backend.c +578 -13
- data/ext/llama_cpp/src/ggml-backend.h +70 -77
- data/ext/llama_cpp/src/ggml-cuda.cu +260 -46
- data/ext/llama_cpp/src/ggml-impl.h +13 -7
- data/ext/llama_cpp/src/ggml-metal.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.m +113 -32
- data/ext/llama_cpp/src/ggml-metal.metal +107 -1
- data/ext/llama_cpp/src/ggml-quants.c +174 -74
- data/ext/llama_cpp/src/ggml.c +881 -1459
- data/ext/llama_cpp/src/ggml.h +64 -45
- data/ext/llama_cpp/src/llama.cpp +555 -49
- data/ext/llama_cpp/src/llama.h +77 -3
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- metadata +3 -2
@@ -6,27 +6,79 @@
|
|
6
6
|
extern "C" {
|
7
7
|
#endif
|
8
8
|
|
9
|
+
struct ggml_backend;
|
9
10
|
struct ggml_backend_buffer;
|
10
11
|
|
11
|
-
|
12
|
-
|
13
|
-
|
12
|
+
//
|
13
|
+
// Legacy API
|
14
|
+
//
|
15
|
+
|
16
|
+
typedef struct ggml_allocr * ggml_allocr_t;
|
17
|
+
|
18
|
+
// initialize allocator for use with CPU backend only
|
19
|
+
GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
|
20
|
+
GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
|
21
|
+
|
22
|
+
// initialize allocator for use with ggml-backend
|
23
|
+
GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
24
|
+
GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
25
|
+
GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
|
26
|
+
|
27
|
+
GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
|
14
28
|
|
15
29
|
// tell the allocator to parse nodes following the order described in the list
|
16
30
|
// you should call this if your graph is optimized to execute out-of-order
|
17
|
-
GGML_API void ggml_allocr_set_parse_seq(
|
18
|
-
|
19
|
-
GGML_API void ggml_allocr_free (
|
20
|
-
GGML_API bool ggml_allocr_is_measure (
|
21
|
-
GGML_API void ggml_allocr_reset (
|
22
|
-
GGML_API void ggml_allocr_alloc (
|
23
|
-
GGML_API size_t
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
31
|
+
GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
|
32
|
+
|
33
|
+
GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
|
34
|
+
GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
|
35
|
+
GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
|
36
|
+
GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
|
37
|
+
GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
|
38
|
+
|
39
|
+
GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
|
40
|
+
|
41
|
+
//
|
42
|
+
// ggml-backend v2 API
|
43
|
+
//
|
44
|
+
|
45
|
+
// Separate tensor and graph allocator objects
|
46
|
+
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
47
|
+
// The original API is kept as a wrapper around the new API
|
48
|
+
|
49
|
+
// Tensor allocator
|
50
|
+
typedef struct ggml_tallocr * ggml_tallocr_t;
|
51
|
+
|
52
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
|
53
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
|
54
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
|
55
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
|
56
|
+
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
|
57
|
+
|
58
|
+
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
|
59
|
+
|
60
|
+
GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
|
61
|
+
GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
|
62
|
+
GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
|
63
|
+
GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
|
64
|
+
GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
|
65
|
+
|
66
|
+
|
67
|
+
// Graph allocator
|
68
|
+
typedef struct ggml_gallocr * ggml_gallocr_t;
|
69
|
+
|
70
|
+
GGML_API ggml_gallocr_t ggml_gallocr_new(void);
|
71
|
+
GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
|
72
|
+
|
73
|
+
GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
|
74
|
+
GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
|
75
|
+
|
76
|
+
// Allocate tensors from the allocators given by the hash table
|
77
|
+
GGML_API void ggml_gallocr_alloc_graph_n(
|
78
|
+
ggml_gallocr_t galloc,
|
79
|
+
struct ggml_cgraph * graph,
|
80
|
+
struct ggml_hash_set hash_set,
|
81
|
+
ggml_tallocr_t * hash_node_talloc);
|
30
82
|
|
31
83
|
#ifdef __cplusplus
|
32
84
|
}
|
@@ -0,0 +1,87 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
// ggml-backend internal header
|
4
|
+
|
5
|
+
#include "ggml-backend.h"
|
6
|
+
|
7
|
+
#ifdef __cplusplus
|
8
|
+
extern "C" {
|
9
|
+
#endif
|
10
|
+
|
11
|
+
//
|
12
|
+
// Backend buffer
|
13
|
+
//
|
14
|
+
|
15
|
+
typedef void * ggml_backend_buffer_context_t;
|
16
|
+
|
17
|
+
struct ggml_backend_buffer_i {
|
18
|
+
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
19
|
+
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
|
20
|
+
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
|
21
|
+
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
|
22
|
+
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
|
23
|
+
};
|
24
|
+
|
25
|
+
struct ggml_backend_buffer {
|
26
|
+
struct ggml_backend_buffer_i iface;
|
27
|
+
|
28
|
+
ggml_backend_t backend;
|
29
|
+
ggml_backend_buffer_context_t context;
|
30
|
+
|
31
|
+
size_t size;
|
32
|
+
};
|
33
|
+
|
34
|
+
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
35
|
+
struct ggml_backend * backend,
|
36
|
+
struct ggml_backend_buffer_i iface,
|
37
|
+
ggml_backend_buffer_context_t context,
|
38
|
+
size_t size);
|
39
|
+
|
40
|
+
//
|
41
|
+
// Backend
|
42
|
+
//
|
43
|
+
|
44
|
+
typedef void * ggml_backend_context_t;
|
45
|
+
|
46
|
+
struct ggml_backend_i {
|
47
|
+
const char * (*get_name)(ggml_backend_t backend);
|
48
|
+
|
49
|
+
void (*free)(ggml_backend_t backend);
|
50
|
+
|
51
|
+
// buffer allocation
|
52
|
+
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
|
53
|
+
|
54
|
+
// get buffer alignment
|
55
|
+
size_t (*get_alignment)(ggml_backend_t backend);
|
56
|
+
|
57
|
+
// tensor data access
|
58
|
+
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
|
59
|
+
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
60
|
+
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
61
|
+
void (*synchronize) (ggml_backend_t backend);
|
62
|
+
|
63
|
+
// (optional) copy tensor between different backends, allows for single-copy transfers
|
64
|
+
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
65
|
+
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
66
|
+
|
67
|
+
// compute graph with a plan
|
68
|
+
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
69
|
+
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
70
|
+
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
71
|
+
|
72
|
+
// compute graph without a plan
|
73
|
+
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
74
|
+
|
75
|
+
// check if the backend supports an operation
|
76
|
+
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
77
|
+
};
|
78
|
+
|
79
|
+
struct ggml_backend {
|
80
|
+
struct ggml_backend_i iface;
|
81
|
+
|
82
|
+
ggml_backend_context_t context;
|
83
|
+
};
|
84
|
+
|
85
|
+
#ifdef __cplusplus
|
86
|
+
}
|
87
|
+
#endif
|