gpt_neox_client 0.1.0

@@ -0,0 +1,1997 @@
1
+ #pragma once
2
+
3
+ //
4
+ // GGML Tensor Library
5
+ //
6
+ // This documentation is still a work in progress.
7
+ // If you would like specific topics to be covered, feel free to drop a comment:
8
+ //
9
+ // https://github.com/ggerganov/whisper.cpp/issues/40
10
+ //
11
+ // ## Overview
12
+ //
13
+ // This library implements:
14
+ //
15
+ // - a set of tensor operations
16
+ // - automatic differentiation
17
+ // - basic optimization algorithms
18
+ //
19
+ // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
20
+ // but is not limited to, the following:
21
+ //
22
+ // - linear regression
23
+ // - support vector machines
24
+ // - neural networks
25
+ //
26
+ // The library allows the user to define a certain function using the available tensor operations. This function
27
+ // definition is represented internally via a computation graph. Each tensor operation in the function definition
28
+ // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
29
+ // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
30
+ // using one of the available optimization algorithms.
31
+ //
32
+ // For example, here we define the function: f(x) = a*x^2 + b
33
+ //
34
+ // {
35
+ // struct ggml_init_params params = {
36
+ // .mem_size = 16*1024*1024,
37
+ // .mem_buffer = NULL,
38
+ // };
39
+ //
40
+ // // memory allocation happens here
41
+ // struct ggml_context * ctx = ggml_init(params);
42
+ //
43
+ // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
44
+ //
45
+ // ggml_set_param(ctx, x); // x is an input variable
46
+ //
47
+ // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
48
+ // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
49
+ // struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
50
+ // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
51
+ //
52
+ // ...
53
+ // }
54
+ //
55
+ // Notice that the function definition above does not involve any actual computation. The computation is performed only
56
+ // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
57
+ //
58
+ // {
59
+ // ...
60
+ //
61
+ // struct ggml_cgraph gf = ggml_build_forward(f);
62
+ //
63
+ // // set the input variable and parameter values
64
+ // ggml_set_f32(x, 2.0f);
65
+ // ggml_set_f32(a, 3.0f);
66
+ // ggml_set_f32(b, 4.0f);
67
+ //
68
+ // ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
69
+ //
70
+ // printf("f = %f\n", ggml_get_f32_1d(f, 0));
71
+ //
72
+ // ...
73
+ // }
74
+ //
75
+ // The actual computation is performed in the ggml_graph_compute() function.
76
+ //
77
+ // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
78
+ // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
79
+ // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough buffer
80
+ // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
81
+ // actually needed.
82
+ //
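+ // For example, a rough sketch of this "measure first" pattern (the buffer size below is illustrative only):
+ //
+ //    {
+ //        struct ggml_init_params params = {
+ //            .mem_size   = 128*1024*1024, // generously sized buffer
+ //            .mem_buffer = NULL,
+ //        };
+ //
+ //        struct ggml_context * ctx = ggml_init(params);
+ //
+ //        // ... define the tensors and the computation graph ...
+ //
+ //        printf("memory actually needed: %zu bytes\n", ggml_used_mem(ctx));
+ //
+ //        ggml_free(ctx);
+ //    }
+ //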
83
+ // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
84
+ // differentiation and optimization algorithms.
85
+ //
86
+ // The described approach makes it possible to define the function graph once and then compute its forward or backward graphs
87
+ // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
88
+ // the user can avoid the memory allocation overhead at runtime.
89
+ //
90
+ // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
91
+ // citizens, but in theory the library can be extended to support FP8 and integer data types.
92
+ //
93
+ // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
94
+ // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
95
+ // clear that the library needs to support more complex operations. The way to support these operations is not clear
96
+ // yet, but the following operations demonstrate a few examples:
97
+ //
98
+ // - ggml_permute()
99
+ // - ggml_conv_1d_1s()
100
+ // - ggml_conv_1d_2s()
101
+ //
102
+ // For each tensor operator, the library implements a forward and backward computation function. The forward function
103
+ // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
104
+ // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
105
+ // calculus class, or watch the following video:
106
+ //
107
+ // What is Automatic Differentiation?
108
+ // https://www.youtube.com/watch?v=wG_nF1awSSY
109
+ //
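+ // As a rough sketch, the backward graph for the f(x) = a*x^2 + b example above could be used like this
+ // (df/dx = 2*a*x, so the expected value at x = 2, a = 3 is 12):
+ //
+ //    {
+ //        struct ggml_cgraph gf = ggml_build_forward (f);
+ //        struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);
+ //
+ //        ggml_set_f32(x, 2.0f);
+ //        ggml_set_f32(a, 3.0f);
+ //        ggml_set_f32(b, 4.0f);
+ //
+ //        ggml_graph_reset(&gf);
+ //        ggml_set_f32(f->grad, 1.0f);
+ //
+ //        ggml_graph_compute_with_ctx(ctx, &gb, n_threads);
+ //
+ //        printf("df/dx = %f\n", ggml_get_f32_1d(x->grad, 0));
+ //    }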
110
+ //
111
+ // ## Tensor data (struct ggml_tensor)
112
+ //
113
+ // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
114
+ // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
115
+ // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
116
+ //
117
+ // {
118
+ // struct ggml_tensor * c = ggml_add(ctx, a, b);
119
+ //
120
+ // assert(c->src[0] == a);
121
+ // assert(c->src[1] == b);
122
+ // }
123
+ //
124
+ // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
125
+ // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This makes it possible
126
+ // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
127
+ // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
128
+ // contiguous in memory.
129
+ //
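+ // For instance, a contiguous 2x3 F32 tensor (ne[0] = 2, ne[1] = 3) would typically have nb[0] = 4
+ // (sizeof(float)) and nb[1] = 8 (nb[0]*ne[0], assuming no padding), so the element at (x, y) is found
+ // at byte offset x*nb[0] + y*nb[1] from the "data" pointer.
+ //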
130
+ // The data of the tensor is accessed via the "data" pointer. For example:
131
+ //
132
+ // {
133
+ // const int nx = 2;
134
+ // const int ny = 3;
135
+ //
136
+ // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
137
+ //
138
+ // for (int y = 0; y < ny; y++) {
139
+ // for (int x = 0; x < nx; x++) {
140
+ // *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
141
+ // }
142
+ // }
143
+ //
144
+ // ...
145
+ // }
146
+ //
147
+ // Alternatively, helper functions such as ggml_get_f32_1d() and ggml_set_f32_1d() can be used.
148
+ //
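+ // For example, continuing the example above (a sketch; the tensor is contiguous, so the flat index of
+ // element (x, y) is y*nx + x):
+ //
+ //    {
+ //        ggml_set_f32_1d(a, 1*nx + 0, 5.0f);                     // a[0, 1] = 5
+ //
+ //        printf("a[0, 1] = %f\n", ggml_get_f32_1d(a, 1*nx + 0));
+ //    }
+ //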
149
+ // ## The matrix multiplication operator (ggml_mul_mat)
150
+ //
151
+ // TODO
152
+ //
153
+ //
154
+ // ## Multi-threading
155
+ //
156
+ // TODO
157
+ //
158
+ //
159
+ // ## Overview of ggml.c
160
+ //
161
+ // TODO
162
+ //
163
+ //
164
+ // ## SIMD optimizations
165
+ //
166
+ // TODO
167
+ //
168
+ //
169
+ // ## Debugging ggml
170
+ //
171
+ // TODO
172
+ //
173
+ //
174
+
175
+ #ifdef GGML_SHARED
176
+ # if defined(_WIN32) && !defined(__MINGW32__)
177
+ # ifdef GGML_BUILD
178
+ # define GGML_API __declspec(dllexport)
179
+ # else
180
+ # define GGML_API __declspec(dllimport)
181
+ # endif
182
+ # else
183
+ # define GGML_API __attribute__ ((visibility ("default")))
184
+ # endif
185
+ #else
186
+ # define GGML_API
187
+ #endif
188
+
189
+ // TODO: support for clang
190
+ #ifdef __GNUC__
191
+ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
192
+ #elif defined(_MSC_VER)
193
+ # define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
194
+ #else
195
+ # define GGML_DEPRECATED(func, hint) func
196
+ #endif
197
+
198
+ #include <stdint.h>
199
+ #include <stddef.h>
200
+ #include <stdbool.h>
201
+
202
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
203
+ #define GGML_FILE_VERSION 1
204
+
205
+ #define GGML_QNT_VERSION 2 // bump this on quantization format changes
206
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
207
+
208
+ #define GGML_MAX_DIMS 4
209
+ #define GGML_MAX_NODES 4096
210
+ #define GGML_MAX_PARAMS 256
211
+ #define GGML_MAX_CONTEXTS 64
212
+ #define GGML_MAX_SRC 6
213
+ #define GGML_MAX_NAME 64
214
+ #define GGML_MAX_OP_PARAMS 32
215
+ #define GGML_DEFAULT_N_THREADS 4
216
+
217
+ #if UINTPTR_MAX == 0xFFFFFFFF
218
+ #define GGML_MEM_ALIGN 4
219
+ #else
220
+ #define GGML_MEM_ALIGN 16
221
+ #endif
222
+
223
+ #define GGML_EXIT_SUCCESS 0
224
+ #define GGML_EXIT_ABORTED 1
225
+
226
+ #define GGUF_MAGIC 0x46554747 // "GGUF"
227
+ #define GGUF_VERSION 2
228
+
229
+ #define GGUF_DEFAULT_ALIGNMENT 32
230
+
231
+ #define GGML_UNUSED(x) (void)(x)
232
+
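+ // for illustration (GGML_PAD rounds x up to a multiple of n; n must be a power of two):
+ //   GGML_PAD(13, 16) == 16, GGML_PAD(32, 16) == 32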
233
+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
234
+
235
+ #define GGML_ASSERT(x) \
236
+ do { \
237
+ if (!(x)) { \
238
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
239
+ abort(); \
240
+ } \
241
+ } while (0)
242
+
243
+ // Used to copy the number of elements and the stride in bytes of a tensor into local variables.
244
+ // The main purpose is to reduce code duplication and improve readability.
245
+ //
246
+ // example:
247
+ //
248
+ // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
249
+ // GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
250
+ //
251
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
252
+ const type prefix##0 = (pointer)->array[0]; \
253
+ GGML_UNUSED(prefix##0);
254
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
255
+ GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
256
+ const type prefix##1 = (pointer)->array[1]; \
257
+ GGML_UNUSED(prefix##1);
258
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
259
+ GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
260
+ const type prefix##2 = (pointer)->array[2]; \
261
+ GGML_UNUSED(prefix##2);
262
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
263
+ GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
264
+ const type prefix##3 = (pointer)->array[3]; \
265
+ GGML_UNUSED(prefix##3);
266
+
267
+ #ifdef __cplusplus
268
+ extern "C" {
269
+ #endif
270
+
271
+ #if defined(__ARM_NEON) && defined(__CUDACC__)
272
+ typedef half ggml_fp16_t;
273
+ #elif defined(__ARM_NEON)
274
+ typedef __fp16 ggml_fp16_t;
275
+ #else
276
+ typedef uint16_t ggml_fp16_t;
277
+ #endif
278
+
279
+ // convert FP16 <-> FP32
280
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
281
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
282
+
283
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
284
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
285
+
286
+ struct ggml_object;
287
+ struct ggml_context;
288
+
289
+ enum ggml_type {
290
+ GGML_TYPE_F32 = 0,
291
+ GGML_TYPE_F16 = 1,
292
+ GGML_TYPE_Q4_0 = 2,
293
+ GGML_TYPE_Q4_1 = 3,
294
+ // GGML_TYPE_Q4_2 = 4, support has been removed
295
+ // GGML_TYPE_Q4_3 (5) support has been removed
296
+ GGML_TYPE_Q5_0 = 6,
297
+ GGML_TYPE_Q5_1 = 7,
298
+ GGML_TYPE_Q8_0 = 8,
299
+ GGML_TYPE_Q8_1 = 9,
300
+ // k-quantizations
301
+ GGML_TYPE_Q2_K = 10,
302
+ GGML_TYPE_Q3_K = 11,
303
+ GGML_TYPE_Q4_K = 12,
304
+ GGML_TYPE_Q5_K = 13,
305
+ GGML_TYPE_Q6_K = 14,
306
+ GGML_TYPE_Q8_K = 15,
307
+ GGML_TYPE_I8,
308
+ GGML_TYPE_I16,
309
+ GGML_TYPE_I32,
310
+ GGML_TYPE_COUNT,
311
+ };
312
+
313
+ enum ggml_backend {
314
+ GGML_BACKEND_CPU = 0,
315
+ GGML_BACKEND_GPU = 10,
316
+ GGML_BACKEND_GPU_SPLIT = 20,
317
+ };
318
+
319
+ // model file types
320
+ enum ggml_ftype {
321
+ GGML_FTYPE_UNKNOWN = -1,
322
+ GGML_FTYPE_ALL_F32 = 0,
323
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
324
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
325
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
326
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
327
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
328
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
329
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
330
+ GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
331
+ GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
332
+ GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
333
+ GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
334
+ GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
335
+ };
336
+
337
+ // available tensor operations:
338
+ enum ggml_op {
339
+ GGML_OP_NONE = 0,
340
+
341
+ GGML_OP_DUP,
342
+ GGML_OP_ADD,
343
+ GGML_OP_ADD1,
344
+ GGML_OP_ACC,
345
+ GGML_OP_SUB,
346
+ GGML_OP_MUL,
347
+ GGML_OP_DIV,
348
+ GGML_OP_SQR,
349
+ GGML_OP_SQRT,
350
+ GGML_OP_LOG,
351
+ GGML_OP_SUM,
352
+ GGML_OP_SUM_ROWS,
353
+ GGML_OP_MEAN,
354
+ GGML_OP_ARGMAX,
355
+ GGML_OP_REPEAT,
356
+ GGML_OP_REPEAT_BACK,
357
+ GGML_OP_CONCAT,
358
+ GGML_OP_SILU_BACK,
359
+ GGML_OP_NORM, // normalize
360
+ GGML_OP_RMS_NORM,
361
+ GGML_OP_RMS_NORM_BACK,
362
+ GGML_OP_GROUP_NORM,
363
+
364
+ GGML_OP_MUL_MAT,
365
+ GGML_OP_OUT_PROD,
366
+
367
+ GGML_OP_SCALE,
368
+ GGML_OP_SET,
369
+ GGML_OP_CPY,
370
+ GGML_OP_CONT,
371
+ GGML_OP_RESHAPE,
372
+ GGML_OP_VIEW,
373
+ GGML_OP_PERMUTE,
374
+ GGML_OP_TRANSPOSE,
375
+ GGML_OP_GET_ROWS,
376
+ GGML_OP_GET_ROWS_BACK,
377
+ GGML_OP_DIAG,
378
+ GGML_OP_DIAG_MASK_INF,
379
+ GGML_OP_DIAG_MASK_ZERO,
380
+ GGML_OP_SOFT_MAX,
381
+ GGML_OP_SOFT_MAX_BACK,
382
+ GGML_OP_ROPE,
383
+ GGML_OP_ROPE_BACK,
384
+ GGML_OP_ALIBI,
385
+ GGML_OP_CLAMP,
386
+ GGML_OP_CONV_1D,
387
+ GGML_OP_CONV_2D,
388
+ GGML_OP_CONV_TRANSPOSE_2D,
389
+ GGML_OP_POOL_1D,
390
+ GGML_OP_POOL_2D,
391
+
392
+ GGML_OP_UPSCALE, // nearest interpolate
393
+
394
+ GGML_OP_FLASH_ATTN,
395
+ GGML_OP_FLASH_FF,
396
+ GGML_OP_FLASH_ATTN_BACK,
397
+ GGML_OP_WIN_PART,
398
+ GGML_OP_WIN_UNPART,
399
+ GGML_OP_GET_REL_POS,
400
+ GGML_OP_ADD_REL_POS,
401
+
402
+ GGML_OP_UNARY,
403
+
404
+ GGML_OP_MAP_UNARY,
405
+ GGML_OP_MAP_BINARY,
406
+
407
+ GGML_OP_MAP_CUSTOM1_F32,
408
+ GGML_OP_MAP_CUSTOM2_F32,
409
+ GGML_OP_MAP_CUSTOM3_F32,
410
+
411
+ GGML_OP_MAP_CUSTOM1,
412
+ GGML_OP_MAP_CUSTOM2,
413
+ GGML_OP_MAP_CUSTOM3,
414
+
415
+ GGML_OP_CROSS_ENTROPY_LOSS,
416
+ GGML_OP_CROSS_ENTROPY_LOSS_BACK,
417
+
418
+ GGML_OP_COUNT,
419
+ };
420
+
421
+ enum ggml_unary_op {
422
+ GGML_UNARY_OP_ABS,
423
+ GGML_UNARY_OP_SGN,
424
+ GGML_UNARY_OP_NEG,
425
+ GGML_UNARY_OP_STEP,
426
+ GGML_UNARY_OP_TANH,
427
+ GGML_UNARY_OP_ELU,
428
+ GGML_UNARY_OP_RELU,
429
+ GGML_UNARY_OP_GELU,
430
+ GGML_UNARY_OP_GELU_QUICK,
431
+ GGML_UNARY_OP_SILU,
432
+ };
433
+
434
+ enum ggml_object_type {
435
+ GGML_OBJECT_TENSOR,
436
+ GGML_OBJECT_GRAPH,
437
+ GGML_OBJECT_WORK_BUFFER
438
+ };
439
+
440
+ // ggml object
441
+ struct ggml_object {
442
+ size_t offs;
443
+ size_t size;
444
+
445
+ struct ggml_object * next;
446
+
447
+ enum ggml_object_type type;
448
+
449
+ char padding[4];
450
+ };
451
+
452
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
453
+
454
+ // n-dimensional tensor
455
+ struct ggml_tensor {
456
+ enum ggml_type type;
457
+ enum ggml_backend backend;
458
+
459
+ int n_dims;
460
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
461
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
462
+ // nb[0] = sizeof(type)
463
+ // nb[1] = nb[0] * ne[0] + padding
464
+ // nb[i] = nb[i-1] * ne[i-1]
465
+
466
+ // compute data
467
+ enum ggml_op op;
468
+
469
+ // op params - allocated as int32_t for alignment
470
+ int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
471
+
472
+ bool is_param;
473
+
474
+ struct ggml_tensor * grad;
475
+ struct ggml_tensor * src[GGML_MAX_SRC];
476
+
477
+ // performance
478
+ int perf_runs;
479
+ int64_t perf_cycles;
480
+ int64_t perf_time_us;
481
+
482
+ void * data;
483
+
484
+ char name[GGML_MAX_NAME];
485
+
486
+ void * extra; // extra things e.g. for ggml-cuda.cu
487
+
488
+ char padding[4];
489
+ };
490
+
491
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
492
+
493
+ // the compute plan that needs to be prepared for ggml_graph_compute()
494
+ // since https://github.com/ggerganov/ggml/issues/287
495
+ struct ggml_cplan {
496
+ size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
497
+ uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
498
+
499
+ int n_threads;
500
+
501
+ // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
502
+ int n_tasks[GGML_MAX_NODES];
503
+
504
+ // abort ggml_graph_compute when true
505
+ bool (*abort_callback)(void * data);
506
+ void * abort_callback_data;
507
+ };
508
+
509
+ // next prime after GGML_MAX_NODES
510
+ // #define GGML_GRAPH_HASHTABLE_SIZE 4099
511
+ // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
512
+ #define GGML_GRAPH_HASHTABLE_SIZE 8273
513
+
514
+ // computation graph
515
+ struct ggml_cgraph {
516
+ int n_nodes;
517
+ int n_leafs;
518
+
519
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
520
+ struct ggml_tensor * grads[GGML_MAX_NODES];
521
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
522
+
523
+ void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
524
+
525
+ // performance
526
+ int perf_runs;
527
+ int64_t perf_cycles;
528
+ int64_t perf_time_us;
529
+ };
530
+
531
+ static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
532
+
533
+ // scratch buffer
534
+ struct ggml_scratch {
535
+ size_t offs;
536
+ size_t size;
537
+ void * data;
538
+ };
539
+
540
+ struct ggml_init_params {
541
+ // memory pool
542
+ size_t mem_size; // bytes
543
+ void * mem_buffer; // if NULL, memory will be allocated internally
544
+ bool no_alloc; // don't allocate memory for the tensor data
545
+ };
546
+
547
+
548
+ // compute types
549
+
550
+ // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
551
+ // This behavior was changed in https://github.com/ggerganov/llama.cpp/pull/1995.
552
+ enum ggml_task_type {
553
+ GGML_TASK_INIT = 0,
554
+ GGML_TASK_COMPUTE,
555
+ GGML_TASK_FINALIZE,
556
+ };
557
+
558
+ struct ggml_compute_params {
559
+ enum ggml_task_type type;
560
+
561
+ // ith = thread index, nth = number of threads
562
+ int ith, nth;
563
+
564
+ // work buffer for all threads
565
+ size_t wsize;
566
+ void * wdata;
567
+ };
568
+
569
+ // misc
570
+
571
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
572
+ GGML_API int64_t ggml_time_ms(void);
573
+ GGML_API int64_t ggml_time_us(void);
574
+ GGML_API int64_t ggml_cycles(void);
575
+ GGML_API int64_t ggml_cycles_per_ms(void);
576
+
577
+ GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
578
+ GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
579
+
580
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
581
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
582
+
583
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
584
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
585
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
586
+ GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
587
+ GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
588
+
589
+ GGML_API int ggml_blck_size (enum ggml_type type);
590
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
591
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
592
+
593
+ GGML_API const char * ggml_type_name(enum ggml_type type);
594
+ GGML_API const char * ggml_op_name (enum ggml_op op);
595
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
596
+
597
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
598
+
599
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
600
+
601
+ // TODO: temporary until model loading of ggml examples is refactored
602
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
603
+
604
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
605
+ GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
606
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
607
+
608
+ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
609
+
610
+ // use this to compute the memory overhead of a tensor
611
+ GGML_API size_t ggml_tensor_overhead(void);
612
+
613
+ // main
614
+
615
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
616
+ GGML_API void ggml_free(struct ggml_context * ctx);
617
+
618
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
619
+
620
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
621
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
622
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
623
+
624
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
625
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
626
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
627
+
628
+ GGML_API struct ggml_tensor * ggml_new_tensor(
629
+ struct ggml_context * ctx,
630
+ enum ggml_type type,
631
+ int n_dims,
632
+ const int64_t *ne);
633
+
634
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
635
+ struct ggml_context * ctx,
636
+ enum ggml_type type,
637
+ int64_t ne0);
638
+
639
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
640
+ struct ggml_context * ctx,
641
+ enum ggml_type type,
642
+ int64_t ne0,
643
+ int64_t ne1);
644
+
645
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
646
+ struct ggml_context * ctx,
647
+ enum ggml_type type,
648
+ int64_t ne0,
649
+ int64_t ne1,
650
+ int64_t ne2);
651
+
652
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
653
+ struct ggml_context * ctx,
654
+ enum ggml_type type,
655
+ int64_t ne0,
656
+ int64_t ne1,
657
+ int64_t ne2,
658
+ int64_t ne3);
659
+
660
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
661
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
662
+
663
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
664
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
665
+
666
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
667
+
668
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
669
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
670
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
671
+
672
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
673
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
674
+
675
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
676
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
677
+
678
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
679
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
680
+
681
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
682
+
683
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
684
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
685
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
686
+
687
+ //
688
+ // operations on tensors with backpropagation
689
+ //
690
+
691
+ GGML_API struct ggml_tensor * ggml_dup(
692
+ struct ggml_context * ctx,
693
+ struct ggml_tensor * a);
694
+
695
+ // in-place, returns view(a)
696
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
697
+ struct ggml_context * ctx,
698
+ struct ggml_tensor * a);
699
+
700
+ GGML_API struct ggml_tensor * ggml_add(
701
+ struct ggml_context * ctx,
702
+ struct ggml_tensor * a,
703
+ struct ggml_tensor * b);
704
+
705
+ GGML_API struct ggml_tensor * ggml_add_inplace(
706
+ struct ggml_context * ctx,
707
+ struct ggml_tensor * a,
708
+ struct ggml_tensor * b);
709
+
710
+ GGML_API struct ggml_tensor * ggml_add1(
711
+ struct ggml_context * ctx,
712
+ struct ggml_tensor * a,
713
+ struct ggml_tensor * b);
714
+
715
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
716
+ struct ggml_context * ctx,
717
+ struct ggml_tensor * a,
718
+ struct ggml_tensor * b);
719
+
720
+ GGML_API struct ggml_tensor * ggml_acc(
721
+ struct ggml_context * ctx,
722
+ struct ggml_tensor * a,
723
+ struct ggml_tensor * b,
724
+ size_t nb1,
725
+ size_t nb2,
726
+ size_t nb3,
727
+ size_t offset);
728
+
729
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
730
+ struct ggml_context * ctx,
731
+ struct ggml_tensor * a,
732
+ struct ggml_tensor * b,
733
+ size_t nb1,
734
+ size_t nb2,
735
+ size_t nb3,
736
+ size_t offset);
737
+
738
+ GGML_API struct ggml_tensor * ggml_sub(
739
+ struct ggml_context * ctx,
740
+ struct ggml_tensor * a,
741
+ struct ggml_tensor * b);
742
+
743
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
744
+ struct ggml_context * ctx,
745
+ struct ggml_tensor * a,
746
+ struct ggml_tensor * b);
747
+
748
+ GGML_API struct ggml_tensor * ggml_mul(
749
+ struct ggml_context * ctx,
750
+ struct ggml_tensor * a,
751
+ struct ggml_tensor * b);
752
+
753
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
754
+ struct ggml_context * ctx,
755
+ struct ggml_tensor * a,
756
+ struct ggml_tensor * b);
757
+
758
+ GGML_API struct ggml_tensor * ggml_div(
759
+ struct ggml_context * ctx,
760
+ struct ggml_tensor * a,
761
+ struct ggml_tensor * b);
762
+
763
+ GGML_API struct ggml_tensor * ggml_div_inplace(
764
+ struct ggml_context * ctx,
765
+ struct ggml_tensor * a,
766
+ struct ggml_tensor * b);
767
+
768
+ GGML_API struct ggml_tensor * ggml_sqr(
769
+ struct ggml_context * ctx,
770
+ struct ggml_tensor * a);
771
+
772
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
773
+ struct ggml_context * ctx,
774
+ struct ggml_tensor * a);
775
+
776
+ GGML_API struct ggml_tensor * ggml_sqrt(
777
+ struct ggml_context * ctx,
778
+ struct ggml_tensor * a);
779
+
780
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
781
+ struct ggml_context * ctx,
782
+ struct ggml_tensor * a);
783
+
784
+ GGML_API struct ggml_tensor * ggml_log(
785
+ struct ggml_context * ctx,
786
+ struct ggml_tensor * a);
787
+
788
+ GGML_API struct ggml_tensor * ggml_log_inplace(
789
+ struct ggml_context * ctx,
790
+ struct ggml_tensor * a);
791
+
792
+ // return scalar
793
+ GGML_API struct ggml_tensor * ggml_sum(
794
+ struct ggml_context * ctx,
795
+ struct ggml_tensor * a);
796
+
797
+ // sums along rows: given input shape [a,b,c,d], the result has shape [1,b,c,d]
798
+ GGML_API struct ggml_tensor * ggml_sum_rows(
799
+ struct ggml_context * ctx,
800
+ struct ggml_tensor * a);
801
+
802
+ // mean along rows
803
+ GGML_API struct ggml_tensor * ggml_mean(
804
+ struct ggml_context * ctx,
805
+ struct ggml_tensor * a);
806
+
807
+ // argmax along rows
808
+ GGML_API struct ggml_tensor * ggml_argmax(
809
+ struct ggml_context * ctx,
810
+ struct ggml_tensor * a);
811
+
812
+ // if a is the same shape as b, and a is not a parameter, return a
813
+ // otherwise, return a new tensor: repeat(a) to fit in b
814
+ GGML_API struct ggml_tensor * ggml_repeat(
815
+ struct ggml_context * ctx,
816
+ struct ggml_tensor * a,
817
+ struct ggml_tensor * b);
818
+
819
+ GGML_API struct ggml_tensor * ggml_repeat_back(
820
+ struct ggml_context * ctx,
821
+ struct ggml_tensor * a,
822
+ struct ggml_tensor * b);
823
+
824
+ // concat a and b on dim 2
825
+ // used in stable-diffusion
826
+ GGML_API struct ggml_tensor * ggml_concat(
827
+ struct ggml_context * ctx,
828
+ struct ggml_tensor * a,
829
+ struct ggml_tensor * b);
830
+
831
+ GGML_API struct ggml_tensor * ggml_abs(
832
+ struct ggml_context * ctx,
833
+ struct ggml_tensor * a);
834
+
835
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
836
+ struct ggml_context * ctx,
837
+ struct ggml_tensor * a);
838
+
839
+ GGML_API struct ggml_tensor * ggml_sgn(
840
+ struct ggml_context * ctx,
841
+ struct ggml_tensor * a);
842
+
843
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
844
+ struct ggml_context * ctx,
845
+ struct ggml_tensor * a);
846
+
847
+ GGML_API struct ggml_tensor * ggml_neg(
848
+ struct ggml_context * ctx,
849
+ struct ggml_tensor * a);
850
+
851
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
852
+ struct ggml_context * ctx,
853
+ struct ggml_tensor * a);
854
+
855
+ GGML_API struct ggml_tensor * ggml_step(
856
+ struct ggml_context * ctx,
857
+ struct ggml_tensor * a);
858
+
859
+ GGML_API struct ggml_tensor * ggml_step_inplace(
860
+ struct ggml_context * ctx,
861
+ struct ggml_tensor * a);
862
+
863
+ GGML_API struct ggml_tensor * ggml_tanh(
864
+ struct ggml_context * ctx,
865
+ struct ggml_tensor * a);
866
+
867
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
868
+ struct ggml_context * ctx,
869
+ struct ggml_tensor * a);
870
+
871
+ GGML_API struct ggml_tensor * ggml_elu(
872
+ struct ggml_context * ctx,
873
+ struct ggml_tensor * a);
874
+
875
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
876
+ struct ggml_context * ctx,
877
+ struct ggml_tensor * a);
878
+
879
+ GGML_API struct ggml_tensor * ggml_relu(
880
+ struct ggml_context * ctx,
881
+ struct ggml_tensor * a);
882
+
883
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
884
+ struct ggml_context * ctx,
885
+ struct ggml_tensor * a);
886
+
887
+ // TODO: double-check this computation is correct
888
+ GGML_API struct ggml_tensor * ggml_gelu(
889
+ struct ggml_context * ctx,
890
+ struct ggml_tensor * a);
891
+
892
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
893
+ struct ggml_context * ctx,
894
+ struct ggml_tensor * a);
895
+
896
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
897
+ struct ggml_context * ctx,
898
+ struct ggml_tensor * a);
899
+
900
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
901
+ struct ggml_context * ctx,
902
+ struct ggml_tensor * a);
903
+
904
+ GGML_API struct ggml_tensor * ggml_silu(
905
+ struct ggml_context * ctx,
906
+ struct ggml_tensor * a);
907
+
908
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
909
+ struct ggml_context * ctx,
910
+ struct ggml_tensor * a);
911
+
912
+ // a - x
913
+ // b - dy
914
+ GGML_API struct ggml_tensor * ggml_silu_back(
915
+ struct ggml_context * ctx,
916
+ struct ggml_tensor * a,
917
+ struct ggml_tensor * b);
918
+
919
+ // normalize along rows
920
+ GGML_API struct ggml_tensor * ggml_norm(
921
+ struct ggml_context * ctx,
922
+ struct ggml_tensor * a,
923
+ float eps);
924
+
925
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
926
+ struct ggml_context * ctx,
927
+ struct ggml_tensor * a,
928
+ float eps);
929
+
930
+ GGML_API struct ggml_tensor * ggml_rms_norm(
931
+ struct ggml_context * ctx,
932
+ struct ggml_tensor * a,
933
+ float eps);
934
+
935
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
936
+ struct ggml_context * ctx,
937
+ struct ggml_tensor * a,
938
+ float eps);
939
+
940
+ // group normalize along ne0*ne1*n_groups
941
+ // used in stable-diffusion
942
+ // TODO: eps is hardcoded to 1e-6 for now
943
+ GGML_API struct ggml_tensor * ggml_group_norm(
944
+ struct ggml_context * ctx,
945
+ struct ggml_tensor * a,
946
+ int n_groups);
947
+
948
+ GGML_API struct ggml_tensor * ggml_group_norm_inplace(
949
+ struct ggml_context * ctx,
950
+ struct ggml_tensor * a,
951
+ int n_groups);
952
+
953
+ // a - x
954
+ // b - dy
955
+ // TODO: update with configurable eps
956
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
957
+ struct ggml_context * ctx,
958
+ struct ggml_tensor * a,
959
+ struct ggml_tensor * b);
960
+
961
+ // A: n columns, m rows
962
+ // B: n columns, p rows (i.e. we transpose it internally)
963
+ // result is m columns, p rows
964
+ GGML_API struct ggml_tensor * ggml_mul_mat(
965
+ struct ggml_context * ctx,
966
+ struct ggml_tensor * a,
967
+ struct ggml_tensor * b);
968
+
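+ // example for ggml_mul_mat (a sketch; shapes given in ggml order, where ne[0] is the number of columns):
+ //   a: ne = [n, m]
+ //   b: ne = [n, p]
+ //   result: ne = [m, p]
+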
969
+ // A: m columns, n rows,
970
+ // B: p columns, n rows,
971
+ // result is m columns, p rows
972
+ GGML_API struct ggml_tensor * ggml_out_prod(
973
+ struct ggml_context * ctx,
974
+ struct ggml_tensor * a,
975
+ struct ggml_tensor * b);
976
+
977
+ //
978
+ // operations on tensors without backpropagation
979
+ //
980
+
981
+ GGML_API struct ggml_tensor * ggml_scale(
982
+ struct ggml_context * ctx,
983
+ struct ggml_tensor * a,
984
+ struct ggml_tensor * b);
985
+
986
+ // in-place, returns view(a)
987
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
988
+ struct ggml_context * ctx,
989
+ struct ggml_tensor * a,
990
+ struct ggml_tensor * b);
991
+
992
+ // b -> view(a,offset,nb1,nb2,nb3), return modified a
993
+ GGML_API struct ggml_tensor * ggml_set(
994
+ struct ggml_context * ctx,
995
+ struct ggml_tensor * a,
996
+ struct ggml_tensor * b,
997
+ size_t nb1,
998
+ size_t nb2,
999
+ size_t nb3,
1000
+ size_t offset);
1001
+
1002
+ // b -> view(a,offset,nb1,nb2,nb3), return view(a)
1003
+ GGML_API struct ggml_tensor * ggml_set_inplace(
1004
+ struct ggml_context * ctx,
1005
+ struct ggml_tensor * a,
1006
+ struct ggml_tensor * b,
1007
+ size_t nb1,
1008
+ size_t nb2,
1009
+ size_t nb3,
1010
+ size_t offset);
1011
+
1012
+ GGML_API struct ggml_tensor * ggml_set_1d(
1013
+ struct ggml_context * ctx,
1014
+ struct ggml_tensor * a,
1015
+ struct ggml_tensor * b,
1016
+ size_t offset);
1017
+
1018
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
1019
+ struct ggml_context * ctx,
1020
+ struct ggml_tensor * a,
1021
+ struct ggml_tensor * b,
1022
+ size_t offset);
1023
+
1024
+ // b -> view(a,offset,nb1), return modified a
1025
+ GGML_API struct ggml_tensor * ggml_set_2d(
1026
+ struct ggml_context * ctx,
1027
+ struct ggml_tensor * a,
1028
+ struct ggml_tensor * b,
1029
+ size_t nb1,
1030
+ size_t offset);
1031
+
1032
+ // b -> view(a,offset,nb1), return view(a)
1033
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
1034
+ struct ggml_context * ctx,
1035
+ struct ggml_tensor * a,
1036
+ struct ggml_tensor * b,
1037
+ size_t nb1,
1038
+ size_t offset);
1039
+
1040
+
1041
+ // a -> b, return view(b)
1042
+ GGML_API struct ggml_tensor * ggml_cpy(
1043
+ struct ggml_context * ctx,
1044
+ struct ggml_tensor * a,
1045
+ struct ggml_tensor * b);
1046
+
1047
+ // a -> b, in-place, return view(b)
1048
+ GGML_API struct ggml_tensor * ggml_cpy_inplace(
1049
+ struct ggml_context * ctx,
1050
+ struct ggml_tensor * a,
1051
+ struct ggml_tensor * b);
1052
+
1053
+ // make contiguous
1054
+ GGML_API struct ggml_tensor * ggml_cont(
1055
+ struct ggml_context * ctx,
1056
+ struct ggml_tensor * a);
1057
+
1058
+ // make contiguous, in-place
1059
+ GGML_API struct ggml_tensor * ggml_cont_inplace(
1060
+ struct ggml_context * ctx,
1061
+ struct ggml_tensor * a);
1062
+
1063
+ // return view(a), b specifies the new shape
1064
+ // TODO: when we start computing gradient, make a copy instead of view
1065
+ GGML_API struct ggml_tensor * ggml_reshape(
1066
+ struct ggml_context * ctx,
1067
+ struct ggml_tensor * a,
1068
+ struct ggml_tensor * b);
1069
+
1070
+ // return view(a)
1071
+ // TODO: when we start computing gradient, make a copy instead of view
1072
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
1073
+ struct ggml_context * ctx,
1074
+ struct ggml_tensor * a,
1075
+ int64_t ne0);
1076
+
1077
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
1078
+ struct ggml_context * ctx,
1079
+ struct ggml_tensor * a,
1080
+ int64_t ne0,
1081
+ int64_t ne1);
1082
+
1083
+ // return view(a)
1084
+ // TODO: when we start computing gradient, make a copy instead of view
1085
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
1086
+ struct ggml_context * ctx,
1087
+ struct ggml_tensor * a,
1088
+ int64_t ne0,
1089
+ int64_t ne1,
1090
+ int64_t ne2);
1091
+
1092
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
1093
+ struct ggml_context * ctx,
1094
+ struct ggml_tensor * a,
1095
+ int64_t ne0,
1096
+ int64_t ne1,
1097
+ int64_t ne2,
1098
+ int64_t ne3);
1099
+
1100
+ // offset in bytes
1101
+ GGML_API struct ggml_tensor * ggml_view_1d(
1102
+ struct ggml_context * ctx,
1103
+ struct ggml_tensor * a,
1104
+ int64_t ne0,
1105
+ size_t offset);
1106
+
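+ // for example, a sketch of viewing row i of a 2d tensor a as a 1d tensor of a->ne[0] elements:
+ //
+ //   struct ggml_tensor * row = ggml_view_1d(ctx, a, a->ne[0], i*a->nb[1]);
+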
1107
+ GGML_API struct ggml_tensor * ggml_view_2d(
1108
+ struct ggml_context * ctx,
1109
+ struct ggml_tensor * a,
1110
+ int64_t ne0,
1111
+ int64_t ne1,
1112
+ size_t nb1, // row stride in bytes
1113
+ size_t offset);
1114
+
1115
+ GGML_API struct ggml_tensor * ggml_view_3d(
1116
+ struct ggml_context * ctx,
1117
+ struct ggml_tensor * a,
1118
+ int64_t ne0,
1119
+ int64_t ne1,
1120
+ int64_t ne2,
1121
+ size_t nb1, // row stride in bytes
1122
+ size_t nb2, // slice stride in bytes
1123
+ size_t offset);
1124
+
1125
+ GGML_API struct ggml_tensor * ggml_view_4d(
1126
+ struct ggml_context * ctx,
1127
+ struct ggml_tensor * a,
1128
+ int64_t ne0,
1129
+ int64_t ne1,
1130
+ int64_t ne2,
1131
+ int64_t ne3,
1132
+ size_t nb1, // row stride in bytes
1133
+ size_t nb2, // slice stride in bytes
1134
+ size_t nb3,
1135
+ size_t offset);
1136
+
1137
+ GGML_API struct ggml_tensor * ggml_permute(
1138
+ struct ggml_context * ctx,
1139
+ struct ggml_tensor * a,
1140
+ int axis0,
1141
+ int axis1,
1142
+ int axis2,
1143
+ int axis3);
1144
+
1145
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
1146
+ GGML_API struct ggml_tensor * ggml_transpose(
1147
+ struct ggml_context * ctx,
1148
+ struct ggml_tensor * a);
1149
+
1150
+ GGML_API struct ggml_tensor * ggml_get_rows(
1151
+ struct ggml_context * ctx,
1152
+ struct ggml_tensor * a,
1153
+ struct ggml_tensor * b);
1154
+
1155
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
1156
+ struct ggml_context * ctx,
1157
+ struct ggml_tensor * a,
1158
+ struct ggml_tensor * b,
1159
+ struct ggml_tensor * c);
1160
+
1161
+ GGML_API struct ggml_tensor * ggml_diag(
1162
+ struct ggml_context * ctx,
1163
+ struct ggml_tensor * a);
1164
+
1165
+ // set elements above the diagonal to -INF
1166
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
1167
+ struct ggml_context * ctx,
1168
+ struct ggml_tensor * a,
1169
+ int n_past);
1170
+
1171
+ // in-place, returns view(a)
1172
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
1173
+ struct ggml_context * ctx,
1174
+ struct ggml_tensor * a,
1175
+ int n_past);
1176
+
1177
+ // set elements above the diagonal to 0
1178
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
1179
+ struct ggml_context * ctx,
1180
+ struct ggml_tensor * a,
1181
+ int n_past);
1182
+
1183
+ // in-place, returns view(a)
1184
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
1185
+ struct ggml_context * ctx,
1186
+ struct ggml_tensor * a,
1187
+ int n_past);
1188
+
1189
+ GGML_API struct ggml_tensor * ggml_soft_max(
1190
+ struct ggml_context * ctx,
1191
+ struct ggml_tensor * a);
1192
+
1193
+ // in-place, returns view(a)
1194
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
1195
+ struct ggml_context * ctx,
1196
+ struct ggml_tensor * a);
1197
+
1198
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
1199
+ struct ggml_context * ctx,
1200
+ struct ggml_tensor * a,
1201
+ struct ggml_tensor * b);
1202
+
1203
+ // in-place, returns view(a)
1204
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
1205
+ struct ggml_context * ctx,
1206
+ struct ggml_tensor * a,
1207
+ struct ggml_tensor * b);
1208
+
1209
+ // rotary position embedding
1210
+ // if mode & 1 != 0, skip n_past elements
1211
+ // if mode & 2 != 0, GPT-NeoX style
1212
+ // if mode & 4 != 0, ChatGLM style
1213
+ // TODO: avoid creating a new tensor every time
1214
+ GGML_API struct ggml_tensor * ggml_rope(
1215
+ struct ggml_context * ctx,
1216
+ struct ggml_tensor * a,
1217
+ int n_past,
1218
+ int n_dims,
1219
+ int mode,
1220
+ int n_ctx);
1221
+
1222
+ // in-place, returns view(a)
1223
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
1224
+ struct ggml_context * ctx,
1225
+ struct ggml_tensor * a,
1226
+ int n_past,
1227
+ int n_dims,
1228
+ int mode,
1229
+ int n_ctx);
1230
+
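+ // for example, a GPT-NeoX style model would typically apply RoPE with mode = 2; a rough sketch
+ // (Kcur, n_rot and n_ctx are illustrative names from a hypothetical model graph):
+ //
+ //   Kcur = ggml_rope_inplace(ctx, Kcur, n_past, n_rot, 2, n_ctx);
+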
1231
+ // custom RoPE
1232
+ GGML_API struct ggml_tensor * ggml_rope_custom(
1233
+ struct ggml_context * ctx,
1234
+ struct ggml_tensor * a,
1235
+ int n_past,
1236
+ int n_dims,
1237
+ int mode,
1238
+ int n_ctx,
1239
+ float freq_base,
1240
+ float freq_scale);
1241
+
1242
+ // in-place, returns view(a)
1243
+ GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1244
+ struct ggml_context * ctx,
1245
+ struct ggml_tensor * a,
1246
+ int n_past,
1247
+ int n_dims,
1248
+ int mode,
1249
+ int n_ctx,
1250
+ float freq_base,
1251
+ float freq_scale);
1252
+
1253
+ // xPos RoPE, in-place, returns view(a)
1254
+ GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
1255
+ struct ggml_context * ctx,
1256
+ struct ggml_tensor * a,
1257
+ int n_past,
1258
+ int n_dims,
1259
+ float base,
1260
+ bool down);
1261
+
1262
+ // rotary position embedding backward, i.e. compute dx from dy
1263
+ // a - dy
1264
+ GGML_API struct ggml_tensor * ggml_rope_back(
1265
+ struct ggml_context * ctx,
1266
+ struct ggml_tensor * a,
1267
+ int n_past,
1268
+ int n_dims,
1269
+ int mode,
1270
+ int n_ctx,
1271
+ float freq_base,
1272
+ float freq_scale,
1273
+ float xpos_base,
1274
+ bool xpos_down);
1275
+
1276
+ // alibi position embedding
1277
+ // in-place, returns view(a)
1278
+ struct ggml_tensor * ggml_alibi(
1279
+ struct ggml_context * ctx,
1280
+ struct ggml_tensor * a,
1281
+ int n_past,
1282
+ int n_head,
1283
+ float bias_max);
1284
+
1285
+ // clamp
1286
+ // in-place, returns view(a)
1287
+ struct ggml_tensor * ggml_clamp(
1288
+ struct ggml_context * ctx,
1289
+ struct ggml_tensor * a,
1290
+ float min,
1291
+ float max);
1292
+
1293
+ GGML_API struct ggml_tensor * ggml_conv_1d(
1294
+ struct ggml_context * ctx,
1295
+ struct ggml_tensor * a,
1296
+ struct ggml_tensor * b,
1297
+ int s0, // stride
1298
+ int p0, // padding
1299
+ int d0); // dilation
1300
+
1301
+ // conv_1d with padding = half
1302
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1303
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
1304
+ struct ggml_context * ctx,
1305
+ struct ggml_tensor * a,
1306
+ struct ggml_tensor * b,
1307
+ int s,
1308
+ int d);
1309
+
1310
+ GGML_API struct ggml_tensor * ggml_conv_2d(
1311
+ struct ggml_context * ctx,
1312
+ struct ggml_tensor * a,
1313
+ struct ggml_tensor * b,
1314
+ int s0,
1315
+ int s1,
1316
+ int p0,
1317
+ int p1,
1318
+ int d0,
1319
+ int d1);
1320
+
1321
+
1322
+ // kernel size is a->ne[0] x a->ne[1]
1323
+ // stride is equal to kernel size
1324
+ // padding is zero
1325
+ // example:
1326
+ // a: 16 16 3 768
1327
+ // b: 1024 1024 3 1
1328
+ // res: 64 64 768 1
1329
+ // used in sam
1330
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1331
+ struct ggml_context * ctx,
1332
+ struct ggml_tensor * a,
1333
+ struct ggml_tensor * b);
1334
+
1335
+ // kernel size is a->ne[0] x a->ne[1]
1336
+ // stride is 1
1337
+ // padding is half
1338
+ // example:
1339
+ // a: 3 3 256 256
1340
+ // b: 64 64 256 1
1341
+ // res: 64 64 256 1
1342
+ // used in sam
1343
+ GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
1344
+ struct ggml_context * ctx,
1345
+ struct ggml_tensor * a,
1346
+ struct ggml_tensor * b);
1347
+
1348
+ GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
1349
+ struct ggml_context * ctx,
1350
+ struct ggml_tensor * a,
1351
+ struct ggml_tensor * b,
1352
+ int stride);
1353
+
1354
+ enum ggml_op_pool {
1355
+ GGML_OP_POOL_MAX,
1356
+ GGML_OP_POOL_AVG,
1357
+ GGML_OP_POOL_COUNT,
1358
+ };
1359
+
1360
+ GGML_API struct ggml_tensor * ggml_pool_1d(
1361
+ struct ggml_context * ctx,
1362
+ struct ggml_tensor * a,
1363
+ enum ggml_op_pool op,
1364
+ int k0, // kernel size
1365
+ int s0, // stride
1366
+ int p0); // padding
1367
+
1368
+ GGML_API struct ggml_tensor * ggml_pool_2d(
1369
+ struct ggml_context * ctx,
1370
+ struct ggml_tensor * a,
1371
+ enum ggml_op_pool op,
1372
+ int k0,
1373
+ int k1,
1374
+ int s0,
1375
+ int s1,
1376
+ int p0,
1377
+ int p1);
1378
+
1379
+ // nearest interpolate
1380
+ // used in stable-diffusion
1381
+ GGML_API struct ggml_tensor * ggml_upscale(
1382
+ struct ggml_context * ctx,
1383
+ struct ggml_tensor * a,
1384
+ int scale_factor);
1385
+
1386
+ GGML_API struct ggml_tensor * ggml_flash_attn(
1387
+ struct ggml_context * ctx,
1388
+ struct ggml_tensor * q,
1389
+ struct ggml_tensor * k,
1390
+ struct ggml_tensor * v,
1391
+ bool masked);
1392
+
1393
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * q,
1396
+ struct ggml_tensor * k,
1397
+ struct ggml_tensor * v,
1398
+ struct ggml_tensor * d,
1399
+ bool masked);
1400
+
1401
+ GGML_API struct ggml_tensor * ggml_flash_ff(
1402
+ struct ggml_context * ctx,
1403
+ struct ggml_tensor * a,
1404
+ struct ggml_tensor * b0,
1405
+ struct ggml_tensor * b1,
1406
+ struct ggml_tensor * c0,
1407
+ struct ggml_tensor * c1);
1408
+
1409
+ // partition into non-overlapping windows with padding if needed
1410
+ // example:
1411
+ // a: 768 64 64 1
1412
+ // w: 14
1413
+ // res: 768 14 14 25
1414
+ // used in sam
1415
+ GGML_API struct ggml_tensor * ggml_win_part(
1416
+ struct ggml_context * ctx,
1417
+ struct ggml_tensor * a,
1418
+ int w);
1419
+
1420
+ // reverse of ggml_win_part
1421
+ // used in sam
1422
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1423
+ struct ggml_context * ctx,
1424
+ struct ggml_tensor * a,
1425
+ int w0,
1426
+ int h0,
1427
+ int w);
1428
+
1429
+ GGML_API struct ggml_tensor * ggml_unary(
1430
+ struct ggml_context * ctx,
1431
+ struct ggml_tensor * a,
1432
+ enum ggml_unary_op op);
1433
+
1434
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
1435
+ struct ggml_context * ctx,
1436
+ struct ggml_tensor * a,
1437
+ enum ggml_unary_op op);
1438
+
1439
+ // used in sam
1440
+ GGML_API struct ggml_tensor * ggml_get_rel_pos(
1441
+ struct ggml_context * ctx,
1442
+ struct ggml_tensor * a,
1443
+ int qh,
1444
+ int kh);
1445
+
1446
+ // used in sam
1447
+
1448
+ GGML_API struct ggml_tensor * ggml_add_rel_pos(
1449
+ struct ggml_context * ctx,
1450
+ struct ggml_tensor * a,
1451
+ struct ggml_tensor * pw,
1452
+ struct ggml_tensor * ph);
1453
+
1454
+ GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
1455
+ struct ggml_context * ctx,
1456
+ struct ggml_tensor * a,
1457
+ struct ggml_tensor * pw,
1458
+ struct ggml_tensor * ph);
1459
+
1460
+ // custom operators
1461
+
1462
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1463
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1464
+
1465
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1466
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1467
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1468
+
1469
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
1470
+ struct ggml_context * ctx,
1471
+ struct ggml_tensor * a,
1472
+ ggml_unary_op_f32_t fun),
1473
+ "use ggml_map_custom1 instead");
1474
+
1475
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1476
+ struct ggml_context * ctx,
1477
+ struct ggml_tensor * a,
1478
+ ggml_unary_op_f32_t fun),
1479
+ "use ggml_map_custom1_inplace instead");
1480
+
1481
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
1482
+ struct ggml_context * ctx,
1483
+ struct ggml_tensor * a,
1484
+ struct ggml_tensor * b,
1485
+ ggml_binary_op_f32_t fun),
1486
+ "use ggml_map_custom2 instead");
1487
+
1488
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1489
+ struct ggml_context * ctx,
1490
+ struct ggml_tensor * a,
1491
+ struct ggml_tensor * b,
1492
+ ggml_binary_op_f32_t fun),
1493
+ "use ggml_map_custom2_inplace instead");
1494
+
1495
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1496
+ struct ggml_context * ctx,
1497
+ struct ggml_tensor * a,
1498
+ ggml_custom1_op_f32_t fun),
1499
+ "use ggml_map_custom1 instead");
1500
+
1501
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1502
+ struct ggml_context * ctx,
1503
+ struct ggml_tensor * a,
1504
+ ggml_custom1_op_f32_t fun),
1505
+ "use ggml_map_custom1_inplace instead");
1506
+
1507
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1508
+ struct ggml_context * ctx,
1509
+ struct ggml_tensor * a,
1510
+ struct ggml_tensor * b,
1511
+ ggml_custom2_op_f32_t fun),
1512
+ "use ggml_map_custom2 instead");
1513
+
1514
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1515
+ struct ggml_context * ctx,
1516
+ struct ggml_tensor * a,
1517
+ struct ggml_tensor * b,
1518
+ ggml_custom2_op_f32_t fun),
1519
+ "use ggml_map_custom2_inplace instead");
1520
+
1521
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1522
+ struct ggml_context * ctx,
1523
+ struct ggml_tensor * a,
1524
+ struct ggml_tensor * b,
1525
+ struct ggml_tensor * c,
1526
+ ggml_custom3_op_f32_t fun),
1527
+ "use ggml_map_custom3 instead");
1528
+
1529
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1530
+ struct ggml_context * ctx,
1531
+ struct ggml_tensor * a,
1532
+ struct ggml_tensor * b,
1533
+ struct ggml_tensor * c,
1534
+ ggml_custom3_op_f32_t fun),
1535
+ "use ggml_map_custom3_inplace instead");
1536
+
1537
+ // custom operators v2
1538
+
1539
+ typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
1540
+ typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
1541
+ typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
1542
+
1543
+ #define GGML_N_TASKS_MAX -1
1544
+
1545
+ GGML_API struct ggml_tensor * ggml_map_custom1(
1546
+ struct ggml_context * ctx,
1547
+ struct ggml_tensor * a,
1548
+ ggml_custom1_op_t fun,
1549
+ int n_tasks,
1550
+ void * userdata);
1551
+
1552
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
1553
+ struct ggml_context * ctx,
1554
+ struct ggml_tensor * a,
1555
+ ggml_custom1_op_t fun,
1556
+ int n_tasks,
1557
+ void * userdata);
1558
+
1559
+ GGML_API struct ggml_tensor * ggml_map_custom2(
1560
+ struct ggml_context * ctx,
1561
+ struct ggml_tensor * a,
1562
+ struct ggml_tensor * b,
1563
+ ggml_custom2_op_t fun,
1564
+ int n_tasks,
1565
+ void * userdata);
1566
+
1567
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
1568
+ struct ggml_context * ctx,
1569
+ struct ggml_tensor * a,
1570
+ struct ggml_tensor * b,
1571
+ ggml_custom2_op_t fun,
1572
+ int n_tasks,
1573
+ void * userdata);
1574
+
1575
+ GGML_API struct ggml_tensor * ggml_map_custom3(
1576
+ struct ggml_context * ctx,
1577
+ struct ggml_tensor * a,
1578
+ struct ggml_tensor * b,
1579
+ struct ggml_tensor * c,
1580
+ ggml_custom3_op_t fun,
1581
+ int n_tasks,
1582
+ void * userdata);
1583
+
1584
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
1585
+ struct ggml_context * ctx,
1586
+ struct ggml_tensor * a,
1587
+ struct ggml_tensor * b,
1588
+ struct ggml_tensor * c,
1589
+ ggml_custom3_op_t fun,
1590
+ int n_tasks,
1591
+ void * userdata);
1592
+
1593
+ // loss function
1594
+
1595
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1596
+ struct ggml_context * ctx,
1597
+ struct ggml_tensor * a,
1598
+ struct ggml_tensor * b);
1599
+
1600
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1601
+ struct ggml_context * ctx,
1602
+ struct ggml_tensor * a,
1603
+ struct ggml_tensor * b,
1604
+ struct ggml_tensor * c);
1605
+
1606
+ //
1607
+ // automatic differentiation
1608
+ //
1609
+
1610
+ GGML_API void ggml_set_param(
1611
+ struct ggml_context * ctx,
1612
+ struct ggml_tensor * tensor);
1613
+
1614
+
1615
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1616
+
1617
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1618
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
1619
+
1620
+ // graph allocation in a context
1621
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx);
1622
+ GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
1623
+ GGML_API size_t ggml_graph_overhead(void);
1624
+
1625
+ // ggml_graph_plan() has to be called before ggml_graph_compute()
1626
+ // when plan.work_size > 0, caller must allocate memory for plan.work_data
1627
+ GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1628
+ GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1629
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
1630
+
1631
+ // same as ggml_graph_compute() but the work data is allocated as a part of the context
1632
+ // note: the drawback of this API is that you must ensure that the context has enough memory for the work data
1633
+ GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1634
+
1635
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
1636
+
1637
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
1638
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
1639
+
1640
+ // print info and performance information for the graph
1641
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
1642
+
1643
+ // dump the graph into a file using the dot format
1644
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1645
+
1646
+ //
1647
+ // optimization
1648
+ //
1649
+
1650
+ // optimization methods
1651
+ enum ggml_opt_type {
1652
+ GGML_OPT_ADAM,
1653
+ GGML_OPT_LBFGS,
1654
+ };
1655
+
1656
+ // linesearch methods
1657
+ enum ggml_linesearch {
1658
+ GGML_LINESEARCH_DEFAULT = 1,
1659
+
1660
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
1661
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
1662
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
1663
+ };
1664
+
1665
+ // optimization return values
1666
+ enum ggml_opt_result {
1667
+ GGML_OPT_OK = 0,
1668
+ GGML_OPT_DID_NOT_CONVERGE,
1669
+ GGML_OPT_NO_CONTEXT,
1670
+ GGML_OPT_INVALID_WOLFE,
1671
+ GGML_OPT_FAIL,
1672
+
1673
+ GGML_LINESEARCH_FAIL = -128,
1674
+ GGML_LINESEARCH_MINIMUM_STEP,
1675
+ GGML_LINESEARCH_MAXIMUM_STEP,
1676
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
1677
+ GGML_LINESEARCH_INVALID_PARAMETERS,
1678
+ };
1679
+
+ // optimization parameters
+ //
+ // see ggml.c (ggml_opt_default_params) for default values
+ //
+ struct ggml_opt_params {
+     enum ggml_opt_type type;
+
+     int n_threads;
+
+     // delta-based convergence test
+     //
+     // if past == 0 - disabled
+     // if past > 0:
+     //   stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+     //
+     int   past;
+     float delta;
+
+     // maximum number of iterations without improvement
+     //
+     // if 0 - disabled
+     // if > 0:
+     //   assume convergence if no cost improvement in this number of iterations
+     //
+     int max_no_improvement;
+
+     bool print_forward_graph;
+     bool print_backward_graph;
+
+     // ADAM parameters
+     struct {
+         int n_iter;
+
+         float sched; // schedule multiplier (fixed, decay or warmup)
+         float decay; // weight decay for AdamW, use 0.0f to disable
+         float alpha; // learning rate
+         float beta1;
+         float beta2;
+         float eps;   // epsilon for numerical stability
+         float eps_f; // epsilon for convergence test
+         float eps_g; // epsilon for convergence test
+     } adam;
+
+     // LBFGS parameters
+     struct {
+         int m; // number of corrections to approximate the inv. Hessian
+         int n_iter;
+         int max_linesearch;
+
+         float eps;      // convergence tolerance
+         float ftol;     // line search tolerance
+         float wolfe;
+         float min_step;
+         float max_step;
+
+         enum ggml_linesearch linesearch;
+     } lbfgs;
+ };
+
+ struct ggml_opt_context {
+     struct ggml_context * ctx;
+     struct ggml_opt_params params;
+
+     int iter;
+     int64_t nx; // number of parameter elements
+
+     bool just_initialized;
+
+     struct {
+         struct ggml_tensor * x;  // view of the parameters
+         struct ggml_tensor * g1; // gradient
+         struct ggml_tensor * g2; // gradient squared
+         struct ggml_tensor * m;  // first moment
+         struct ggml_tensor * v;  // second moment
+         struct ggml_tensor * mh; // first moment hat
+         struct ggml_tensor * vh; // second moment hat
+         struct ggml_tensor * pf; // past function values
+         float fx_best;
+         float fx_prev;
+         int n_no_improvement;
+     } adam;
+
+     struct {
+         struct ggml_tensor * x;    // current parameters
+         struct ggml_tensor * xp;   // previous parameters
+         struct ggml_tensor * g;    // current gradient
+         struct ggml_tensor * gp;   // previous gradient
+         struct ggml_tensor * d;    // search direction
+         struct ggml_tensor * pf;   // past function values
+         struct ggml_tensor * lmal; // the L-BFGS memory alpha
+         struct ggml_tensor * lmys; // the L-BFGS memory ys
+         struct ggml_tensor * lms;  // the L-BFGS memory s
+         struct ggml_tensor * lmy;  // the L-BFGS memory y
+         float fx_best;
+         float step;
+         int j;
+         int k;
+         int end;
+         int n_no_improvement;
+     } lbfgs;
+ };
+
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+ // optimize the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt(
+         struct ggml_context * ctx,
+         struct ggml_opt_params params,
+         struct ggml_tensor * f);
+
+ // initialize optimizer context
+ GGML_API void ggml_opt_init(
+         struct ggml_context * ctx,
+         struct ggml_opt_context * opt,
+         struct ggml_opt_params params,
+         int64_t nx);
+
+ // continue optimizing the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt_resume(
+         struct ggml_context * ctx,
+         struct ggml_opt_context * opt,
+         struct ggml_tensor * f);
+
+ // continue optimizing the function defined by the tensor f, using previously built forward (gf) and backward (gb) graphs
+ GGML_API enum ggml_opt_result ggml_opt_resume_g(
+         struct ggml_context * ctx,
+         struct ggml_opt_context * opt,
+         struct ggml_tensor * f,
+         struct ggml_cgraph * gf,
+         struct ggml_cgraph * gb);
+
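+ // For example, to minimize a scalar function f built from tensors that were marked with ggml_set_param()
+ // (a minimal sketch: "ctx" and "f" are assumed to exist already, and ADAM is just one of the two methods):
+ //
+ //   {
+ //       struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+ //
+ //       opt_params.adam.n_iter = 100; // optional: tweak the defaults before running
+ //
+ //       enum ggml_opt_result res = ggml_opt(ctx, opt_params, f);
+ //       if (res != GGML_OPT_OK) {
+ //           // handle GGML_OPT_DID_NOT_CONVERGE, GGML_OPT_NO_CONTEXT, ...
+ //       }
+ //   }
+ //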
+ //
+ // quantization
+ //
+
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
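+ // For example, to quantize a buffer of floats to Q4_0 (a minimal sketch: "src", the element count "n" and the
+ // row length "k" are assumed to come from the caller, k must be a multiple of the Q4_0 block size, the 16-entry
+ // histogram size is an assumption taken from common usage, and the destination buffer is sized generously):
+ //
+ //   {
+ //       int64_t hist[16] = {0};
+ //
+ //       void * dst = malloc(n*sizeof(float)); // q4_0 data is much smaller; this is a safe upper bound
+ //
+ //       size_t dst_size = ggml_quantize_q4_0(src, dst, n, k, hist);
+ //
+ //       // dst now holds dst_size bytes of quantized data and hist counts the quantized value occurrences;
+ //       // use the data, then free(dst)
+ //   }
+ //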
+ //
+ // gguf
+ //
+
+ enum gguf_type {
+     GGUF_TYPE_UINT8   = 0,
+     GGUF_TYPE_INT8    = 1,
+     GGUF_TYPE_UINT16  = 2,
+     GGUF_TYPE_INT16   = 3,
+     GGUF_TYPE_UINT32  = 4,
+     GGUF_TYPE_INT32   = 5,
+     GGUF_TYPE_FLOAT32 = 6,
+     GGUF_TYPE_BOOL    = 7,
+     GGUF_TYPE_STRING  = 8,
+     GGUF_TYPE_ARRAY   = 9,
+     GGUF_TYPE_UINT64  = 10,
+     GGUF_TYPE_INT64   = 11,
+     GGUF_TYPE_FLOAT64 = 12,
+     GGUF_TYPE_COUNT,  // marks the end of the enum
+ };
+
+ struct gguf_context;
+
+ struct gguf_init_params {
+     bool no_alloc;
+
+     // if not NULL, create a ggml_context and allocate the tensor data in it
+     struct ggml_context ** ctx;
+ };
+
+ GGML_API struct gguf_context * gguf_init_empty(void);
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+ GGML_API void gguf_free(struct gguf_context * ctx);
+
+ GGML_API const char * gguf_type_name(enum gguf_type type);
+
+ GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+ GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+ GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+ GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+ GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+ GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+ GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+ GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+ GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+ // results are undefined if the wrong type is used for the key
+ GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+ GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+ GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+ GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+ GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+ GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+ GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+ GGML_API uint64_t     gguf_get_val_u64 (struct gguf_context * ctx, int i);
+ GGML_API int64_t      gguf_get_val_i64 (struct gguf_context * ctx, int i);
+ GGML_API double       gguf_get_val_f64 (struct gguf_context * ctx, int i);
+ GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+ GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+ GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+ GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+ GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+ GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+ GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+ GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+ GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
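+ // For example, to open a gguf file and enumerate its key-value pairs and tensors (a minimal sketch: the file
+ // name is a placeholder, and the tensor data itself is not loaded here because no ggml_context is requested):
+ //
+ //   {
+ //       struct gguf_init_params params = {
+ //           /*.no_alloc =*/ true,
+ //           /*.ctx      =*/ NULL,
+ //       };
+ //
+ //       struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
+ //
+ //       for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
+ //           printf("%s: %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
+ //       }
+ //
+ //       for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
+ //           printf("tensor %d: %s\n", i, gguf_get_tensor_name(ctx, i));
+ //       }
+ //
+ //       gguf_free(ctx);
+ //   }
+ //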
+ // overrides an existing value or adds a new one
+ GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+ GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+ // set or add KV pairs from another context
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+ // manage tensor info
+ GGML_API void gguf_add_tensor     (struct gguf_context * ctx, const struct ggml_tensor * tensor);
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+ // writing gguf files can be done in two ways:
+ //
+ // - write the entire gguf_context to a binary file in a single pass:
+ //
+ //   gguf_write_to_file(ctx, fname, false);
+ //
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+ //
+ //   FILE * f = fopen(fname, "wb");
+ //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+ //   fwrite(..., f); // write the tensor data here
+ //   void * data = malloc(gguf_get_meta_size(ctx));
+ //   gguf_get_meta_data(ctx, data);
+ //   fseek(f, 0, SEEK_SET);
+ //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
+ //   free(data);
+ //   fclose(f);
+ //
+
+ // write the entire context to a binary file
+ GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+ GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+ GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
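+ // For example, to build a new gguf file from scratch using the single-pass API (a minimal sketch: the key
+ // names and the file name are placeholders, and "t" is assumed to be a ggml_tensor created in some
+ // ggml_context beforehand):
+ //
+ //   {
+ //       struct gguf_context * ctx = gguf_init_empty();
+ //
+ //       gguf_set_val_str(ctx, "general.name", "my-model");
+ //       gguf_set_val_u32(ctx, "my.block_count", 12);
+ //
+ //       gguf_add_tensor(ctx, t); // registers the tensor info and data
+ //
+ //       gguf_write_to_file(ctx, "out.gguf", false);
+ //
+ //       gguf_free(ctx);
+ //   }
+ //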
+ //
+ // system info
+ //
+
+ GGML_API int ggml_cpu_has_avx        (void);
+ GGML_API int ggml_cpu_has_avx2       (void);
+ GGML_API int ggml_cpu_has_avx512     (void);
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_fma        (void);
+ GGML_API int ggml_cpu_has_neon       (void);
+ GGML_API int ggml_cpu_has_arm_fma    (void);
+ GGML_API int ggml_cpu_has_f16c       (void);
+ GGML_API int ggml_cpu_has_fp16_va    (void);
+ GGML_API int ggml_cpu_has_wasm_simd  (void);
+ GGML_API int ggml_cpu_has_blas       (void);
+ GGML_API int ggml_cpu_has_cublas     (void);
+ GGML_API int ggml_cpu_has_clblast    (void);
+ GGML_API int ggml_cpu_has_gpublas    (void);
+ GGML_API int ggml_cpu_has_sse3       (void);
+ GGML_API int ggml_cpu_has_ssse3      (void);
+ GGML_API int ggml_cpu_has_vsx        (void);
+
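+ // For example, to report a few of the detected CPU features at startup (a minimal sketch; the selection of
+ // flags shown here is arbitrary):
+ //
+ //   {
+ //       printf("AVX = %d | AVX2 = %d | NEON = %d | BLAS = %d\n",
+ //           ggml_cpu_has_avx(), ggml_cpu_has_avx2(), ggml_cpu_has_neon(), ggml_cpu_has_blas());
+ //   }
+ //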
+ //
+ // Internal types and functions exposed for tests and benchmarks
+ //
+
+ #ifdef __cplusplus
+ // restrict not standard in C++
+ #define GGML_RESTRICT
+ #else
+ #define GGML_RESTRICT restrict
+ #endif
+ typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+ typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+ typedef struct {
+     const char * type_name;
+     int          blck_size;
+     size_t       type_size;
+     bool         is_quantized;
+     ggml_to_float_t   to_float;
+     ggml_from_float_t from_float;
+     ggml_from_float_t from_float_reference;
+     ggml_vec_dot_t    vec_dot;
+     enum ggml_type    vec_dot_type;
+ } ggml_type_traits_t;
+
+ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+
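+ // For example, a test can query the traits of a type and use them generically (a minimal sketch; the printed
+ // fields are just an illustration of the struct members):
+ //
+ //   {
+ //       ggml_type_traits_t traits = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
+ //
+ //       printf("%s: block size = %d, quantized = %d\n",
+ //           traits.type_name, traits.blck_size, traits.is_quantized);
+ //   }
+ //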
+ #ifdef __cplusplus
+ }
+ #endif