llama-rb 0.1.0
- checksums.yaml +7 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +85 -0
- data/LICENSE +21 -0
- data/README.md +81 -0
- data/Rakefile +10 -0
- data/ext/llama/common.cpp +311 -0
- data/ext/llama/common.h +95 -0
- data/ext/llama/extconf.rb +12 -0
- data/ext/llama/ggml.c +10642 -0
- data/ext/llama/ggml.h +778 -0
- data/ext/llama/llama.cpp +1815 -0
- data/ext/llama/llama.h +152 -0
- data/ext/llama/model.cpp +192 -0
- data/lib/llama/model.rb +86 -0
- data/lib/llama/version.rb +3 -0
- data/lib/llama.rb +6 -0
- data/llama-rb.gemspec +50 -0
- data/models/.gitkeep +0 -0
- metadata +80 -0
data/ext/llama/ggml.h
ADDED
@@ -0,0 +1,778 @@
#pragma once

//
// GGML Tensor Library
//
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
//   https://github.com/ggerganov/whisper.cpp/issues/40
//
// ## Overview
//
// This library implements:
//
//  - a set of tensor operations
//  - automatic differentiation
//  - basic optimization algorithms
//
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
// but is not limited to, the following:
//
//  - linear regression
//  - support vector machines
//  - neural networks
//
// The library allows the user to define a certain function using the available tensor operations. This function
// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
// using one of the available optimization algorithms.
//
// For example, here we define the function: f(x) = a*x^2 + b
//
//   {
//       struct ggml_init_params params = {
//           .mem_size   = 16*1024*1024,
//           .mem_buffer = NULL,
//       };
//
//       // memory allocation happens here
//       struct ggml_context * ctx = ggml_init(params);
//
//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
//       ggml_set_param(ctx, x); // x is an input variable
//
//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
//
//       ...
//   }
//
// Notice that the function definition above does not involve any actual computation. The computation is performed only
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
//
//   {
//       ...
//
//       struct ggml_cgraph gf = ggml_build_forward(f);
//
//       // set the input variable and parameter values
//       ggml_set_f32(x, 2.0f);
//       ggml_set_f32(a, 3.0f);
//       ggml_set_f32(b, 4.0f);
//
//       ggml_graph_compute(ctx, &gf);
//
//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
//       ...
//   }
//
// The actual computation is performed in the ggml_graph_compute() function.
//
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
// actually needed.
//
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
// differentiation and optimization algorithms.
//
// The described approach allows to define the function graph once and then compute its forward or backward graphs
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
// the user can avoid the memory allocation overhead at runtime.
//
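// Building on the example above, a sketch of also computing the gradient df/dx with the backward graph
// (using ggml_build_backward() declared below; the output gradient f->grad is seeded with 1.0 first):
//
//   {
//       ...
//
//       struct ggml_cgraph gf = ggml_build_forward (f);
//       struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
//
//       ggml_graph_reset(&gf);
//       ggml_set_f32(f->grad, 1.0f);
//
//       ggml_graph_compute(ctx, &gb);
//
//       // with a = 3.0 and x = 2.0: df/dx = 2*a*x = 12.0
//       printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
//
//       ...
//   }
//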
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
// citizens, but in theory the library can be extended to support FP8 and integer data types.
//
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
// clear that the library needs to support more complex operations. The way to support these operations is not clear
// yet, but a few examples are demonstrated in the following operations:
//
//  - ggml_permute()
//  - ggml_conv_1d_1s()
//  - ggml_conv_1d_2s()
//
// For each tensor operator, the library implements a forward and backward computation function. The forward function
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
// calculus class, or watch the following video:
//
//   What is Automatic Differentiation?
//   https://www.youtube.com/watch?v=wG_nF1awSSY
//
//
// ## Tensor data (struct ggml_tensor)
//
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
//
//   {
//       struct ggml_tensor * c = ggml_add(ctx, a, b);
//
//       assert(c->src[0] == a);
//       assert(c->src[1] == b);
//   }
//
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
// contiguous in memory.
//
// The data of the tensor is accessed via the "data" pointer. For example:
//
//   {
//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
//       // a[1, 2] = 1.0f;
//       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
//
//       // a[2, 0] = 2.0f;
//       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
//
//       ...
//   }
//
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
//
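// For instance, a sketch of the first store from the example above written with the helpers (for a
// contiguous tensor the 1d helpers use flat row-major indexing, i.e. i = i1*ne[0] + i0):
//
//   {
//       // a[1, 2] = 1.0f, i.e. i0 = 1, i1 = 2 -> i = 2*2 + 1 = 5
//       ggml_set_f32_1d(a, 5, 1.0f);
//
//       printf("a[1, 2] = %f\n", ggml_get_f32_1d(a, 5));
//   }
//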
// ## The matrix multiplication operator (ggml_mul_mat)
//
// TODO
//
//
// ## Multi-threading
//
// TODO
//
//
// ## Overview of ggml.c
//
// TODO
//
//
// ## SIMD optimizations
//
// TODO
//
//
// ## Debugging ggml
//
// TODO
//
//

#ifdef __cplusplus
extern "C" {
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

#define GGML_MAX_DIMS     4
#define GGML_MAX_NODES    4096
#define GGML_MAX_PARAMS   16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT      4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
#endif

// convert FP16 <-> FP32
float       ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x);

struct ggml_object;
struct ggml_context;

enum ggml_type {
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_F16,
    GGML_TYPE_F32,
    GGML_TYPE_COUNT,
};

// available tensor operations:
enum ggml_op {
    GGML_OP_NONE = 0,

    GGML_OP_DUP,
    GGML_OP_ADD,
    GGML_OP_SUB,
    GGML_OP_MUL,
    GGML_OP_DIV,
    GGML_OP_SQR,
    GGML_OP_SQRT,
    GGML_OP_SUM,
    GGML_OP_MEAN,
    GGML_OP_REPEAT,
    GGML_OP_ABS,
    GGML_OP_SGN,
    GGML_OP_NEG,
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

    GGML_OP_SCALE,
    GGML_OP_CPY,
    GGML_OP_RESHAPE,
    GGML_OP_VIEW,
    GGML_OP_PERMUTE,
    GGML_OP_TRANSPOSE,
    GGML_OP_GET_ROWS,
    GGML_OP_DIAG_MASK_INF,
    GGML_OP_SOFT_MAX,
    GGML_OP_ROPE,
    GGML_OP_CONV_1D_1S,
    GGML_OP_CONV_1D_2S,

    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,

    GGML_OP_COUNT,
};

// n-dimensional tensor
struct ggml_tensor {
    enum ggml_type type;

    int    n_dims;
    int    ne[GGML_MAX_DIMS]; // number of elements
    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
                              // nb[0] = sizeof(type)
                              // nb[1] = nb[0]   * ne[0] + padding
                              // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;

    bool is_param;

    struct ggml_tensor * grad;
    struct ggml_tensor * src0;
    struct ggml_tensor * src1;
    struct ggml_tensor * opt[GGML_MAX_OPT];

    // thread scheduling
    int n_tasks;

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;

    void * data;
    char padding[8];
};

// computation graph
struct ggml_cgraph {
    int n_nodes;
    int n_leafs;
    int n_threads;

    size_t work_size;
    struct ggml_tensor * work;

    struct ggml_tensor * nodes[GGML_MAX_NODES];
    struct ggml_tensor * grads[GGML_MAX_NODES];
    struct ggml_tensor * leafs[GGML_MAX_NODES];

    // performance
    int     perf_runs;
    int64_t perf_cycles;
    int64_t perf_time_us;
};
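
// the number of threads used by ggml_graph_compute() is taken from the graph itself - e.g. (a sketch):
//
//   struct ggml_cgraph gf = ggml_build_forward(f);
//   gf.n_threads = 4; // compute this graph with 4 threads
//   ggml_graph_compute(ctx, &gf);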

// scratch buffer
struct ggml_scratch {
    size_t offs;
    size_t size;
    void * data;
};

struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
    bool   no_alloc;   // don't allocate memory for the tensor data
};

void    ggml_time_init(void); // call this once at the beginning of the program
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);
int64_t ggml_cycles_per_ms(void);

void ggml_print_object (const struct ggml_object * obj);
void ggml_print_objects(const struct ggml_context * ctx);

int    ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes   (const struct ggml_tensor * tensor);

int    ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

size_t ggml_element_size(const struct ggml_tensor * tensor);

struct ggml_context * ggml_init(struct ggml_init_params params);
void ggml_free(struct ggml_context * ctx);

size_t ggml_used_mem(const struct ggml_context * ctx);
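
// e.g. a sketch of the sizing approach described in the overview above: start with a generous buffer,
// build the graph once, then read back how much was actually used:
//
//   struct ggml_init_params params = {
//       .mem_size   = 128*1024*1024, // generous first guess
//       .mem_buffer = NULL,
//   };
//   struct ggml_context * ctx = ggml_init(params);
//
//   // ... create tensors and build the computation graph ...
//
//   const size_t mem_needed = ggml_used_mem(ctx);
//   ggml_free(ctx);
//   // on the next run, ggml_init() can be called with mem_size = mem_needed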

size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

bool ggml_mlock_supported(void);
bool ggml_mlock(
        struct ggml_context * ctx,
        const void *opt_extra_addr,
        size_t opt_extra_len,
        char **err_p);

struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
        const int *ne);

struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0);

struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1);

struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2);

struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    ne0,
        int    ne1,
        int    ne2,
        int    ne3);

struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

 void * ggml_get_data    (const struct ggml_tensor * tensor);
float * ggml_get_data_f32(const struct ggml_tensor * tensor);

//
// operations on tensors with backpropagation
//

struct ggml_tensor * ggml_dup(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_add(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_sub(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_mul(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_div(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_sqr(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_sqrt(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// return scalar
// TODO: compute sum along rows
struct ggml_tensor * ggml_sum(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// mean along rows
struct ggml_tensor * ggml_mean(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
struct ggml_tensor * ggml_repeat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_abs(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_sgn(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_neg(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_step(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_relu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// TODO: double-check this computation is correct
struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_silu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

struct ggml_tensor * ggml_rms_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
struct ggml_tensor * ggml_mul_mat(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);
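
// e.g. a sketch of the shape convention above in terms of ne[] (ne[0] is the row length; each element
// of the result is a dot product of a row of A with a row of B):
//
//   struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 4); // 4 rows, 2 columns
//   struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); // 3 rows, 2 columns
//
//   // C: ne = { 4, 3 }, i.e. m = 4 columns, p = 3 rows
//   struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);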

//
// operations on tensors without backpropagation
//

// in-place, returns view(a)
struct ggml_tensor * ggml_scale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// a -> b, return view(b)
struct ggml_tensor * ggml_cpy(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    ne0,
        int    ne1);

// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    ne0,
        int    ne1,
        int    ne2);

// offset in bytes
struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    ne0,
        size_t offset);

struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    ne0,
        int    ne1,
        size_t nb1, // row stride in bytes
        size_t offset);

struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    axis0,
        int    axis1,
        int    axis2,
        int    axis3);

// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
struct ggml_tensor * ggml_transpose(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);
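
// e.g. a sketch: ggml_transpose() only swaps the ne/nb entries, so the result is a non-contiguous view;
// copying it into a fresh tensor with ggml_cpy() yields a contiguous transposed matrix:
//
//   struct ggml_tensor * t  = ggml_transpose(ctx, a);
//   struct ggml_tensor * tc = ggml_cpy(ctx, t,
//           ggml_new_tensor_2d(ctx, GGML_TYPE_F32, t->ne[0], t->ne[1]));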

struct ggml_tensor * ggml_get_rows(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

// set elements above the diagonal to -INF
// in-place, returns view(a)
struct ggml_tensor * ggml_diag_mask_inf(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    n_past);

// in-place, returns view(a)
struct ggml_tensor * ggml_soft_max(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

// rotary position embedding
// in-place, returns view(a)
// if mode == 1, skip n_past elements
// TODO: avoid creating a new tensor every time
struct ggml_tensor * ggml_rope(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int    n_past,
        int    n_dims,
        int    mode);

// padding = 1
// TODO: we don't support extra parameters for now
//       that's why we are hard-coding the stride, padding, and dilation
//       not great ..
struct ggml_tensor * ggml_conv_1d_1s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_conv_1d_2s(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

struct ggml_tensor * ggml_flash_attn(
        struct ggml_context * ctx,
        struct ggml_tensor  * q,
        struct ggml_tensor  * k,
        struct ggml_tensor  * v,
        bool   masked);

struct ggml_tensor * ggml_flash_ff(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b0,
        struct ggml_tensor  * b1,
        struct ggml_tensor  * c0,
        struct ggml_tensor  * c1);

//
// automatic differentiation
//

void ggml_set_param(
        struct ggml_context * ctx,
        struct ggml_tensor  * tensor);

void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
void ggml_graph_reset  (struct ggml_cgraph * cgraph);

// print info and performance information for the graph
void ggml_graph_print(const struct ggml_cgraph * cgraph);

// dump the graph into a file using the dot format
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

//
// optimization
//

// optimization methods
enum ggml_opt_type {
    GGML_OPT_ADAM,
    GGML_OPT_LBFGS,
};

// linesearch methods
enum ggml_linesearch {
    GGML_LINESEARCH_DEFAULT = 1,

    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
};

// optimization return values
enum ggml_opt_result {
    GGML_OPT_OK = 0,
    GGML_OPT_DID_NOT_CONVERGE,
    GGML_OPT_NO_CONTEXT,
    GGML_OPT_INVALID_WOLFE,
    GGML_OPT_FAIL,

    GGML_LINESEARCH_FAIL = -128,
    GGML_LINESEARCH_MINIMUM_STEP,
    GGML_LINESEARCH_MAXIMUM_STEP,
    GGML_LINESEARCH_MAXIMUM_ITERATIONS,
    GGML_LINESEARCH_INVALID_PARAMETERS,
};

// optimization parameters
//
//   see ggml.c (ggml_opt_default_params) for default values
//
struct ggml_opt_params {
    enum ggml_opt_type type;

    int n_threads;

    // delta-based convergence test
    //
    //   if past == 0 - disabled
    //   if past > 0:
    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
    //
    int past;
    float delta;

    // maximum number of iterations without improvement
    //
    //   if 0 - disabled
    //   if > 0:
    //     assume convergence if no cost improvement in this number of iterations
    //
    int max_no_improvement;

    bool print_forward_graph;
    bool print_backward_graph;

    // ADAM parameters
    struct {
        int n_iter;

        float alpha; // learning rate
        float beta1;
        float beta2;
        float eps;   // epsilon for numerical stability
        float eps_f; // epsilon for convergence test
        float eps_g; // epsilon for convergence test
    } adam;

    // LBFGS parameters
    struct {
        int m; // number of corrections to approximate the inv. Hessian
        int n_iter;
        int max_linesearch;

        float eps;      // convergence tolerance
        float ftol;     // line search tolerance
        float wolfe;
        float min_step;
        float max_step;

        enum ggml_linesearch linesearch;
    } lbfgs;
};

struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

// optimize the function defined by the tensor f
enum ggml_opt_result ggml_opt(
        struct ggml_context * ctx,
        struct ggml_opt_params params,
        struct ggml_tensor * f);
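
// e.g. a minimal sketch that minimizes f (built from tensors marked with ggml_set_param()) using the
// default ADAM settings:
//
//   struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
//   opt_params.n_threads = 4;
//
//   enum ggml_opt_result res = ggml_opt(ctx, opt_params, f);
//   if (res != GGML_OPT_OK) {
//       // inspect the ggml_opt_result values above, e.g. GGML_OPT_DID_NOT_CONVERGE
//   }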

//
// quantization
//

size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);

//
// system info
//

int ggml_cpu_has_avx(void);
int ggml_cpu_has_avx2(void);
int ggml_cpu_has_avx512(void);
int ggml_cpu_has_fma(void);
int ggml_cpu_has_neon(void);
int ggml_cpu_has_arm_fma(void);
int ggml_cpu_has_f16c(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
int ggml_cpu_has_sse3(void);
int ggml_cpu_has_vsx(void);

#ifdef __cplusplus
}
#endif