whispercpp 1.2.0.2 → 1.3.1

Files changed (135)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +46 -86
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -7
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/ggml/include/ggml.h +2285 -0
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/include/whisper.h +672 -0
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1608 -159
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/src/whisper.cpp +7393 -0
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -8616
  133. data/ext/ggml.h +0 -748
  134. data/ext/whisper.cpp +0 -4829
  135. data/ext/whisper.h +0 -402
data/ext/ggml/include/ggml.h
@@ -0,0 +1,2285 @@
+ #pragma once
+
+ //
+ // GGML Tensor Library
+ //
+ // This documentation is still a work in progress.
+ // If you wish some specific topics to be covered, feel free to drop a comment:
+ //
+ //   https://github.com/ggerganov/whisper.cpp/issues/40
+ //
+ // ## Overview
+ //
+ // This library implements:
+ //
+ //  - a set of tensor operations
+ //  - automatic differentiation
+ //  - basic optimization algorithms
+ //
+ // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
+ // but is not limited to, the following:
+ //
+ //  - linear regression
+ //  - support vector machines
+ //  - neural networks
+ //
+ // The library allows the user to define a certain function using the available tensor operations. This function
+ // definition is represented internally via a computation graph. Each tensor operation in the function definition
+ // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+ // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+ // using one of the available optimization algorithms.
+ //
+ // For example, here we define the function: f(x) = a*x^2 + b
+ //
+ //   {
+ //       struct ggml_init_params params = {
+ //           .mem_size   = 16*1024*1024,
+ //           .mem_buffer = NULL,
+ //       };
+ //
+ //       // memory allocation happens here
+ //       struct ggml_context * ctx = ggml_init(params);
+ //
+ //       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+ //
+ //       ggml_set_param(ctx, x); // x is an input variable
+ //
+ //       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+ //       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+ //       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
+ //       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
+ //
+ //       ...
+ //   }
+ //
+ // Notice that the function definition above does not involve any actual computation. The computation is performed only
+ // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
+ //
+ //   {
+ //       ...
+ //
+ //       struct ggml_cgraph * gf = ggml_new_graph(ctx);
+ //       ggml_build_forward_expand(gf, f);
+ //
+ //       // set the input variable and parameter values
+ //       ggml_set_f32(x, 2.0f);
+ //       ggml_set_f32(a, 3.0f);
+ //       ggml_set_f32(b, 4.0f);
+ //
+ //       ggml_graph_compute_with_ctx(ctx, gf, n_threads);
+ //
+ //       printf("f = %f\n", ggml_get_f32_1d(f, 0));
+ //
+ //       ...
+ //   }
+ //
+ // The actual computation is performed in the ggml_graph_compute() function.
+ //
+ // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
+ // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
+ // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
+ // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
+ // actually needed.
+ //
+ // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
+ // differentiation and optimization algorithms.
+ //
+ // The described approach allows to define the function graph once and then compute its forward or backward graphs
+ // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
+ // the user can avoid the memory allocation overhead at runtime.
+ //
+ // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
+ // citizens, but in theory the library can be extended to support FP8 and integer data types.
+ //
+ // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
+ // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
+ // clear that the library needs to support more complex operations. The way to support these operations is not clear
+ // yet, but a few examples are demonstrated in the following operations:
+ //
+ //  - ggml_permute()
+ //  - ggml_conv_1d_1s()
+ //  - ggml_conv_1d_2s()
+ //
+ // For each tensor operator, the library implements a forward and backward computation function. The forward function
+ // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
+ // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
+ // calculus class, or watch the following video:
+ //
+ //   What is Automatic Differentiation?
+ //   https://www.youtube.com/watch?v=wG_nF1awSSY
+ //
+ //
+ // ## Tensor data (struct ggml_tensor)
+ //
+ // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
+ // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
+ // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
+ //
+ //   {
+ //       struct ggml_tensor * c = ggml_add(ctx, a, b);
+ //
+ //       assert(c->src[0] == a);
+ //       assert(c->src[1] == b);
+ //   }
+ //
+ // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
+ // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
+ // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
+ // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
+ // contiguous in memory.
+ //
+ // The data of the tensor is accessed via the "data" pointer. For example:
+ //
+ //   {
+ //       const int nx = 2;
+ //       const int ny = 3;
+ //
+ //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
+ //
+ //       for (int y = 0; y < ny; y++) {
+ //           for (int x = 0; x < nx; x++) {
+ //               *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+ //           }
+ //       }
+ //
+ //       ...
+ //   }
+ //
+ // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
+ //
+ // ## The matrix multiplication operator (ggml_mul_mat)
+ //
+ // TODO
+ //
+ //
+ // ## Multi-threading
+ //
+ // TODO
+ //
+ //
+ // ## Overview of ggml.c
+ //
+ // TODO
+ //
+ //
+ // ## SIMD optimizations
+ //
+ // TODO
+ //
+ //
+ // ## Debugging ggml
+ //
+ // TODO
+ //
+ //
+
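A minimal sketch of the allocate-then-measure pattern described in the comment above (illustrative only, not part of the diff; the 128 MB figure is an arbitrary upper bound chosen for this sketch):

    #include <stdio.h>
    #include "ggml.h"

    static void example_measure_memory(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 128*1024*1024,  // generous upper bound, assumed for this sketch
            /*.mem_buffer =*/ NULL,           // let ggml allocate the pool internally
            /*.no_alloc   =*/ false,
        };

        struct ggml_context * ctx = ggml_init(params);

        // ... create tensors and build the computation graph here ...

        // ggml_used_mem() reports how much of the pool was actually consumed,
        // which can be used to size the buffer tightly on subsequent runs
        fprintf(stderr, "used memory: %zu bytes\n", ggml_used_mem(ctx));

        ggml_free(ctx);
    }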
+ #ifdef GGML_SHARED
+ #    if defined(_WIN32) && !defined(__MINGW32__)
+ #        ifdef GGML_BUILD
+ #            define GGML_API __declspec(dllexport) extern
+ #        else
+ #            define GGML_API __declspec(dllimport) extern
+ #        endif
+ #    else
+ #        define GGML_API __attribute__ ((visibility ("default"))) extern
+ #    endif
+ #else
+ #    define GGML_API extern
+ #endif
+
+ // TODO: support for clang
+ #ifdef __GNUC__
+ #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+ #elif defined(_MSC_VER)
+ #    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+ #else
+ #    define GGML_DEPRECATED(func, hint) func
+ #endif
+
+ #ifndef __GNUC__
+ #    define GGML_ATTRIBUTE_FORMAT(...)
+ #elif defined(__MINGW32__)
+ #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #else
+ #    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #endif
+
+ #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdio.h>
+
+ #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
+ #define GGML_FILE_VERSION 2
+
+ #define GGML_QNT_VERSION        2    // bump this on quantization format changes
+ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
+ #define GGML_MAX_DIMS      4
+ #define GGML_MAX_PARAMS    2048
+ #define GGML_MAX_SRC       10
+ #define GGML_MAX_N_THREADS 512
+ #define GGML_MAX_OP_PARAMS 64
+
+ #ifndef GGML_MAX_NAME
+ #    define GGML_MAX_NAME 64
+ #endif
+
+ #define GGML_DEFAULT_N_THREADS  4
+ #define GGML_DEFAULT_GRAPH_SIZE 2048
+
+ #if UINTPTR_MAX == 0xFFFFFFFF
+     #define GGML_MEM_ALIGN 4
+ #else
+     #define GGML_MEM_ALIGN 16
+ #endif
+
+ #define GGML_EXIT_SUCCESS 0
+ #define GGML_EXIT_ABORTED 1
+
+ #define GGML_ROPE_TYPE_NEOX   2
+ #define GGML_ROPE_TYPE_MROPE  8
+ #define GGML_ROPE_TYPE_VISION 24
+
+ #define GGUF_MAGIC "GGUF"
+
+ #define GGUF_VERSION 3
+
+ #define GGUF_DEFAULT_ALIGNMENT 32
+
+ #define GGML_UNUSED(x) (void)(x)
+
+ #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
+
+ #ifndef NDEBUG
+ #    define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
+ #elif defined(__GNUC__)
+ #    define GGML_UNREACHABLE() __builtin_unreachable()
+ #elif defined(_MSC_VER)
+ #    define GGML_UNREACHABLE() __assume(0)
+ #else
+ #    define GGML_UNREACHABLE() ((void) 0)
+ #endif
+
+ #ifdef __cplusplus
+ #    define GGML_NORETURN [[noreturn]]
+ #elif defined(_MSC_VER)
+ #    define GGML_NORETURN __declspec(noreturn)
+ #else
+ #    define GGML_NORETURN _Noreturn
+ #endif
+
+ #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+ #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
+ // used to copy the number of elements and stride in bytes of tensors into local variables.
+ // main purpose is to reduce code duplication and improve readability.
+ //
+ // example:
+ //
+ //    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+ //    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+ //
+ #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+     const type prefix##0 = (pointer)->array[0]; \
+     GGML_UNUSED(prefix##0);
+ #define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
+     const type prefix##1 = (pointer)->array[1]; \
+     GGML_UNUSED(prefix##1);
+ #define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
+     const type prefix##2 = (pointer)->array[2]; \
+     GGML_UNUSED(prefix##2);
+ #define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+     GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
+     const type prefix##3 = (pointer)->array[3]; \
+     GGML_UNUSED(prefix##3);
+
+ #define GGML_TENSOR_UNARY_OP_LOCALS \
+     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+ #define GGML_TENSOR_BINARY_OP_LOCALS \
+     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+     GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+     GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+ #define GGML_TENSOR_BINARY_OP_LOCALS01 \
+     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+     GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+     GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
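As a rough illustration of how the GGML_TENSOR_LOCALS helpers above are used (hypothetical snippet, not part of the diff):

    // GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) expands to roughly:
    //     const int64_t ne10 = (src1)->ne[0]; GGML_UNUSED(ne10);
    //     const int64_t ne11 = (src1)->ne[1]; GGML_UNUSED(ne11);
    //     const int64_t ne12 = (src1)->ne[2]; GGML_UNUSED(ne12);
    //     const int64_t ne13 = (src1)->ne[3]; GGML_UNUSED(ne13);
    static void example_op(const struct ggml_tensor * src1) {
        GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);  // element counts per dimension
        GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);  // strides in bytes per dimension
        // ne10..ne13 and nb10..nb13 are now plain local variables
    }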
+ GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+ GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
+ enum ggml_status {
+     GGML_STATUS_ALLOC_FAILED = -2,
+     GGML_STATUS_FAILED = -1,
+     GGML_STATUS_SUCCESS = 0,
+     GGML_STATUS_ABORTED = 1,
+ };
+
+ // get ggml_status name string
+ GGML_API const char * ggml_status_to_string(enum ggml_status status);
+
+ // ieee 754-2008 half-precision float16
+ // todo: make this not an integral type
+ typedef uint16_t ggml_fp16_t;
+ GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+ GGML_API void        ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+ GGML_API void        ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+ // google brain half-precision bfloat16
+ typedef struct { uint16_t bits; } ggml_bf16_t;
+ GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+ GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+ GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+ GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
+ GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
+
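A small sketch of the scalar and row conversion helpers declared above (illustrative only, not part of the diff):

    static void example_fp16_conversion(void) {
        // scalar round trip through the IEEE 754 half-precision type
        const float       x = 0.5f;
        const ggml_fp16_t h = ggml_fp32_to_fp16(x);
        const float       y = ggml_fp16_to_fp32(h);   // y ~= x, within fp16 precision
        (void) y;

        // bulk conversion of a row of values
        float       src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        ggml_fp16_t dst[4];
        ggml_fp32_to_fp16_row(src, dst, 4);
    }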
+ struct ggml_object;
+ struct ggml_context;
+ struct ggml_cgraph;
+
+ // NOTE: always add types at the end of the enum to keep backward compatibility
+ enum ggml_type {
+     GGML_TYPE_F32 = 0,
+     GGML_TYPE_F16 = 1,
+     GGML_TYPE_Q4_0 = 2,
+     GGML_TYPE_Q4_1 = 3,
+     // GGML_TYPE_Q4_2 = 4, support has been removed
+     // GGML_TYPE_Q4_3 = 5, support has been removed
+     GGML_TYPE_Q5_0 = 6,
+     GGML_TYPE_Q5_1 = 7,
+     GGML_TYPE_Q8_0 = 8,
+     GGML_TYPE_Q8_1 = 9,
+     GGML_TYPE_Q2_K = 10,
+     GGML_TYPE_Q3_K = 11,
+     GGML_TYPE_Q4_K = 12,
+     GGML_TYPE_Q5_K = 13,
+     GGML_TYPE_Q6_K = 14,
+     GGML_TYPE_Q8_K = 15,
+     GGML_TYPE_IQ2_XXS = 16,
+     GGML_TYPE_IQ2_XS = 17,
+     GGML_TYPE_IQ3_XXS = 18,
+     GGML_TYPE_IQ1_S = 19,
+     GGML_TYPE_IQ4_NL = 20,
+     GGML_TYPE_IQ3_S = 21,
+     GGML_TYPE_IQ2_S = 22,
+     GGML_TYPE_IQ4_XS = 23,
+     GGML_TYPE_I8 = 24,
+     GGML_TYPE_I16 = 25,
+     GGML_TYPE_I32 = 26,
+     GGML_TYPE_I64 = 27,
+     GGML_TYPE_F64 = 28,
+     GGML_TYPE_IQ1_M = 29,
+     GGML_TYPE_BF16 = 30,
+     // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+     // GGML_TYPE_Q4_0_4_8 = 32,
+     // GGML_TYPE_Q4_0_8_8 = 33,
+     GGML_TYPE_TQ1_0 = 34,
+     GGML_TYPE_TQ2_0 = 35,
+     // GGML_TYPE_IQ4_NL_4_4 = 36,
+     // GGML_TYPE_IQ4_NL_4_8 = 37,
+     // GGML_TYPE_IQ4_NL_8_8 = 38,
+     GGML_TYPE_COUNT = 39,
+ };
+
+ // precision
+ enum ggml_prec {
+     GGML_PREC_DEFAULT,
+     GGML_PREC_F32,
+ };
+
+ enum ggml_backend_type {
+     GGML_BACKEND_TYPE_CPU = 0,
+     GGML_BACKEND_TYPE_GPU = 10,
+     GGML_BACKEND_TYPE_GPU_SPLIT = 20,
+ };
+
+ // model file types
+ enum ggml_ftype {
+     GGML_FTYPE_UNKNOWN = -1,
+     GGML_FTYPE_ALL_F32 = 0,
+     GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+     GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+     GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
+     GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+     GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+ };
+
+ // available tensor operations:
+ enum ggml_op {
+     GGML_OP_NONE = 0,
+
+     GGML_OP_DUP,
+     GGML_OP_ADD,
+     GGML_OP_ADD1,
+     GGML_OP_ACC,
+     GGML_OP_SUB,
+     GGML_OP_MUL,
+     GGML_OP_DIV,
+     GGML_OP_SQR,
+     GGML_OP_SQRT,
+     GGML_OP_LOG,
+     GGML_OP_SIN,
+     GGML_OP_COS,
+     GGML_OP_SUM,
+     GGML_OP_SUM_ROWS,
+     GGML_OP_MEAN,
+     GGML_OP_ARGMAX,
+     GGML_OP_COUNT_EQUAL,
+     GGML_OP_REPEAT,
+     GGML_OP_REPEAT_BACK,
+     GGML_OP_CONCAT,
+     GGML_OP_SILU_BACK,
+     GGML_OP_NORM, // normalize
+     GGML_OP_RMS_NORM,
+     GGML_OP_RMS_NORM_BACK,
+     GGML_OP_GROUP_NORM,
+
+     GGML_OP_MUL_MAT,
+     GGML_OP_MUL_MAT_ID,
+     GGML_OP_OUT_PROD,
+
+     GGML_OP_SCALE,
+     GGML_OP_SET,
+     GGML_OP_CPY,
+     GGML_OP_CONT,
+     GGML_OP_RESHAPE,
+     GGML_OP_VIEW,
+     GGML_OP_PERMUTE,
+     GGML_OP_TRANSPOSE,
+     GGML_OP_GET_ROWS,
+     GGML_OP_GET_ROWS_BACK,
+     GGML_OP_DIAG,
+     GGML_OP_DIAG_MASK_INF,
+     GGML_OP_DIAG_MASK_ZERO,
+     GGML_OP_SOFT_MAX,
+     GGML_OP_SOFT_MAX_BACK,
+     GGML_OP_ROPE,
+     GGML_OP_ROPE_BACK,
+     GGML_OP_CLAMP,
+     GGML_OP_CONV_TRANSPOSE_1D,
+     GGML_OP_IM2COL,
+     GGML_OP_IM2COL_BACK,
+     GGML_OP_CONV_TRANSPOSE_2D,
+     GGML_OP_POOL_1D,
+     GGML_OP_POOL_2D,
+     GGML_OP_POOL_2D_BACK,
+     GGML_OP_UPSCALE, // nearest interpolate
+     GGML_OP_PAD,
+     GGML_OP_PAD_REFLECT_1D,
+     GGML_OP_ARANGE,
+     GGML_OP_TIMESTEP_EMBEDDING,
+     GGML_OP_ARGSORT,
+     GGML_OP_LEAKY_RELU,
+
+     GGML_OP_FLASH_ATTN_EXT,
+     GGML_OP_FLASH_ATTN_BACK,
+     GGML_OP_SSM_CONV,
+     GGML_OP_SSM_SCAN,
+     GGML_OP_WIN_PART,
+     GGML_OP_WIN_UNPART,
+     GGML_OP_GET_REL_POS,
+     GGML_OP_ADD_REL_POS,
+     GGML_OP_RWKV_WKV6,
+
+     GGML_OP_UNARY,
+
+     GGML_OP_MAP_UNARY,
+     GGML_OP_MAP_BINARY,
+
+     GGML_OP_MAP_CUSTOM1_F32,
+     GGML_OP_MAP_CUSTOM2_F32,
+     GGML_OP_MAP_CUSTOM3_F32,
+
+     GGML_OP_MAP_CUSTOM1,
+     GGML_OP_MAP_CUSTOM2,
+     GGML_OP_MAP_CUSTOM3,
+
+     GGML_OP_CROSS_ENTROPY_LOSS,
+     GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+     GGML_OP_OPT_STEP_ADAMW,
+
+     GGML_OP_COUNT,
+ };
+
+ enum ggml_unary_op {
+     GGML_UNARY_OP_ABS,
+     GGML_UNARY_OP_SGN,
+     GGML_UNARY_OP_NEG,
+     GGML_UNARY_OP_STEP,
+     GGML_UNARY_OP_TANH,
+     GGML_UNARY_OP_ELU,
+     GGML_UNARY_OP_RELU,
+     GGML_UNARY_OP_SIGMOID,
+     GGML_UNARY_OP_GELU,
+     GGML_UNARY_OP_GELU_QUICK,
+     GGML_UNARY_OP_SILU,
+     GGML_UNARY_OP_HARDSWISH,
+     GGML_UNARY_OP_HARDSIGMOID,
+     GGML_UNARY_OP_EXP,
+
+     GGML_UNARY_OP_COUNT,
+ };
+
+ enum ggml_object_type {
+     GGML_OBJECT_TYPE_TENSOR,
+     GGML_OBJECT_TYPE_GRAPH,
+     GGML_OBJECT_TYPE_WORK_BUFFER
+ };
+
+ enum ggml_log_level {
+     GGML_LOG_LEVEL_NONE = 0,
+     GGML_LOG_LEVEL_DEBUG = 1,
+     GGML_LOG_LEVEL_INFO = 2,
+     GGML_LOG_LEVEL_WARN = 3,
+     GGML_LOG_LEVEL_ERROR = 4,
+     GGML_LOG_LEVEL_CONT = 5, // continue previous log
+ };
+
+ // this tensor...
+ enum ggml_tensor_flag {
+     GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
+     GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+     GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
+     GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
+ };
+
+ struct ggml_init_params {
+     // memory pool
+     size_t mem_size;   // bytes
+     void * mem_buffer; // if NULL, memory will be allocated internally
+     bool   no_alloc;   // don't allocate memory for the tensor data
+ };
+
+ // n-dimensional tensor
+ struct ggml_tensor {
+     enum ggml_type type;
+
+     GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
+
+     struct ggml_backend_buffer * buffer;
+
+     int64_t ne[GGML_MAX_DIMS]; // number of elements
+     size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                                // nb[0] = ggml_type_size(type)
+                                // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding
+                                // nb[i] = nb[i-1] * ne[i-1]
+
+     // compute data
+     enum ggml_op op;
+
+     // op params - allocated as int32_t for alignment
+     int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
+     int32_t flags;
+
+     struct ggml_tensor * src[GGML_MAX_SRC];
+
+     // source tensor and offset for views
+     struct ggml_tensor * view_src;
+     size_t               view_offs;
+
+     void * data;
+
+     char name[GGML_MAX_NAME];
+
+     void * extra; // extra things e.g. for ggml-cuda.cu
+
+     char padding[8];
+ };
+
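For the common contiguous F32 case, the stride rules in the nb[] comment above work out as follows (illustrative sketch, not part of the diff):

    // contiguous 2x3 F32 tensor (ne = {2, 3, 1, 1}):
    //     nb[0] = sizeof(float)         =  4 bytes  (next element within a row)
    //     nb[1] = nb[0] * ne[0] = 4 * 2 =  8 bytes  (next row)
    //     nb[2] = nb[1] * ne[1] = 8 * 3 = 24 bytes
    //     nb[3] = nb[2] * ne[2]         = 24 bytes
    static float example_read_element(const struct ggml_tensor * t, int x, int y) {
        // generic strided access; also valid for non-contiguous (transposed/permuted) views
        return *(const float *) ((const char *) t->data + y*t->nb[1] + x*t->nb[0]);
    }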
+ static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
+ // Abort callback
+ // If not NULL, called before ggml computation
+ // If it returns true, the computation is aborted
+ typedef bool (*ggml_abort_callback)(void * data);
+
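A minimal sketch of an abort callback of this shape (hypothetical; it would be passed to whichever compute/params API accepts a ggml_abort_callback):

    // the user data is assumed here to point to a flag set from another thread
    static bool example_should_abort(void * data) {
        const volatile bool * cancel_requested = (const volatile bool *) data;
        return *cancel_requested;   // returning true aborts the computation
    }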
630
+
631
+ //
632
+ // GUID
633
+ //
634
+
635
+ // GUID types
636
+ typedef uint8_t ggml_guid[16];
637
+ typedef ggml_guid * ggml_guid_t;
638
+
639
+ GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b);
640
+
641
+ // misc
642
+
643
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
644
+ GGML_API int64_t ggml_time_ms(void);
645
+ GGML_API int64_t ggml_time_us(void);
646
+ GGML_API int64_t ggml_cycles(void);
647
+ GGML_API int64_t ggml_cycles_per_ms(void);
648
+
649
+ // accepts a UTF-8 path, even on Windows
650
+ GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
651
+
652
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
653
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
654
+
655
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
656
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
657
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
658
+ GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
659
+
660
+ GGML_API int64_t ggml_blck_size(enum ggml_type type);
661
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
662
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
663
+
664
+ GGML_DEPRECATED(
665
+ GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
666
+ "use ggml_row_size() instead");
667
+
668
+ GGML_API const char * ggml_type_name(enum ggml_type type);
669
+ GGML_API const char * ggml_op_name (enum ggml_op op);
670
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);
671
+
672
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
673
+ GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
674
+
675
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
676
+
677
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
678
+
679
+ // TODO: temporary until model loading of ggml examples is refactored
680
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
681
+
682
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
683
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
684
+ GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
685
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
686
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
687
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
688
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
689
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
690
+
691
+ GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
692
+ GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
693
+ GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
694
+ GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
695
+
696
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
697
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
698
+
699
+ GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
700
+
701
+ // use this to compute the memory overhead of a tensor
702
+ GGML_API size_t ggml_tensor_overhead(void);
703
+
704
+ GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
705
+
706
+ // main
707
+
708
+ GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
709
+ GGML_API void ggml_reset(struct ggml_context * ctx);
710
+ GGML_API void ggml_free (struct ggml_context * ctx);
711
+
712
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
713
+
714
+ GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
715
+ GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
716
+
717
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
718
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
719
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
720
+
721
+ GGML_API struct ggml_tensor * ggml_new_tensor(
722
+ struct ggml_context * ctx,
723
+ enum ggml_type type,
724
+ int n_dims,
725
+ const int64_t *ne);
726
+
727
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
728
+ struct ggml_context * ctx,
729
+ enum ggml_type type,
730
+ int64_t ne0);
731
+
732
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
733
+ struct ggml_context * ctx,
734
+ enum ggml_type type,
735
+ int64_t ne0,
736
+ int64_t ne1);
737
+
738
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
739
+ struct ggml_context * ctx,
740
+ enum ggml_type type,
741
+ int64_t ne0,
742
+ int64_t ne1,
743
+ int64_t ne2);
744
+
745
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
746
+ struct ggml_context * ctx,
747
+ enum ggml_type type,
748
+ int64_t ne0,
749
+ int64_t ne1,
750
+ int64_t ne2,
751
+ int64_t ne3);
752
+
753
+ GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
754
+
755
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
756
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
757
+
758
+ // Context tensor enumeration and lookup
759
+ GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
760
+ GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
761
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
762
+
763
+ // Converts a flat index into coordinates
764
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
765
+
766
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
767
+
768
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
769
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
770
+
771
+ GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
772
+ GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
773
+ GGML_ATTRIBUTE_FORMAT(2, 3)
774
+ GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
775
+
776
+ // Tensor flags
777
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
778
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
779
+ GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
780
+ GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
781
+
782
+ //
783
+ // operations on tensors with backpropagation
784
+ //
785
+
786
+ GGML_API struct ggml_tensor * ggml_dup(
787
+ struct ggml_context * ctx,
788
+ struct ggml_tensor * a);
789
+
790
+ // in-place, returns view(a)
791
+ GGML_API struct ggml_tensor * ggml_dup_inplace(
792
+ struct ggml_context * ctx,
793
+ struct ggml_tensor * a);
794
+
795
+ GGML_API struct ggml_tensor * ggml_add(
796
+ struct ggml_context * ctx,
797
+ struct ggml_tensor * a,
798
+ struct ggml_tensor * b);
799
+
800
+ GGML_API struct ggml_tensor * ggml_add_inplace(
801
+ struct ggml_context * ctx,
802
+ struct ggml_tensor * a,
803
+ struct ggml_tensor * b);
804
+
805
+ GGML_API struct ggml_tensor * ggml_add_cast(
806
+ struct ggml_context * ctx,
807
+ struct ggml_tensor * a,
808
+ struct ggml_tensor * b,
809
+ enum ggml_type type);
810
+
811
+ GGML_API struct ggml_tensor * ggml_add1(
812
+ struct ggml_context * ctx,
813
+ struct ggml_tensor * a,
814
+ struct ggml_tensor * b);
815
+
816
+ GGML_API struct ggml_tensor * ggml_add1_inplace(
817
+ struct ggml_context * ctx,
818
+ struct ggml_tensor * a,
819
+ struct ggml_tensor * b);
820
+
821
+ // dst = a
822
+ // view(dst, nb1, nb2, nb3, offset) += b
823
+ // return dst
824
+ GGML_API struct ggml_tensor * ggml_acc(
825
+ struct ggml_context * ctx,
826
+ struct ggml_tensor * a,
827
+ struct ggml_tensor * b,
828
+ size_t nb1,
829
+ size_t nb2,
830
+ size_t nb3,
831
+ size_t offset);
832
+
833
+ GGML_API struct ggml_tensor * ggml_acc_inplace(
834
+ struct ggml_context * ctx,
835
+ struct ggml_tensor * a,
836
+ struct ggml_tensor * b,
837
+ size_t nb1,
838
+ size_t nb2,
839
+ size_t nb3,
840
+ size_t offset);
841
+
842
+ GGML_API struct ggml_tensor * ggml_sub(
843
+ struct ggml_context * ctx,
844
+ struct ggml_tensor * a,
845
+ struct ggml_tensor * b);
846
+
847
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
848
+ struct ggml_context * ctx,
849
+ struct ggml_tensor * a,
850
+ struct ggml_tensor * b);
851
+
852
+ GGML_API struct ggml_tensor * ggml_mul(
853
+ struct ggml_context * ctx,
854
+ struct ggml_tensor * a,
855
+ struct ggml_tensor * b);
856
+
857
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
858
+ struct ggml_context * ctx,
859
+ struct ggml_tensor * a,
860
+ struct ggml_tensor * b);
861
+
862
+ GGML_API struct ggml_tensor * ggml_div(
863
+ struct ggml_context * ctx,
864
+ struct ggml_tensor * a,
865
+ struct ggml_tensor * b);
866
+
867
+ GGML_API struct ggml_tensor * ggml_div_inplace(
868
+ struct ggml_context * ctx,
869
+ struct ggml_tensor * a,
870
+ struct ggml_tensor * b);
871
+
872
+ GGML_API struct ggml_tensor * ggml_sqr(
873
+ struct ggml_context * ctx,
874
+ struct ggml_tensor * a);
875
+
876
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
877
+ struct ggml_context * ctx,
878
+ struct ggml_tensor * a);
879
+
880
+ GGML_API struct ggml_tensor * ggml_sqrt(
881
+ struct ggml_context * ctx,
882
+ struct ggml_tensor * a);
883
+
884
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
885
+ struct ggml_context * ctx,
886
+ struct ggml_tensor * a);
887
+
888
+ GGML_API struct ggml_tensor * ggml_log(
889
+ struct ggml_context * ctx,
890
+ struct ggml_tensor * a);
891
+
892
+ GGML_API struct ggml_tensor * ggml_log_inplace(
893
+ struct ggml_context * ctx,
894
+ struct ggml_tensor * a);
895
+
896
+ GGML_API struct ggml_tensor * ggml_sin(
897
+ struct ggml_context * ctx,
898
+ struct ggml_tensor * a);
899
+
900
+ GGML_API struct ggml_tensor * ggml_sin_inplace(
901
+ struct ggml_context * ctx,
902
+ struct ggml_tensor * a);
903
+
904
+ GGML_API struct ggml_tensor * ggml_cos(
905
+ struct ggml_context * ctx,
906
+ struct ggml_tensor * a);
907
+
908
+ GGML_API struct ggml_tensor * ggml_cos_inplace(
909
+ struct ggml_context * ctx,
910
+ struct ggml_tensor * a);
911
+
912
+ // return scalar
913
+ GGML_API struct ggml_tensor * ggml_sum(
914
+ struct ggml_context * ctx,
915
+ struct ggml_tensor * a);
916
+
917
+ // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
918
+ GGML_API struct ggml_tensor * ggml_sum_rows(
919
+ struct ggml_context * ctx,
920
+ struct ggml_tensor * a);
921
+
922
+ // mean along rows
923
+ GGML_API struct ggml_tensor * ggml_mean(
924
+ struct ggml_context * ctx,
925
+ struct ggml_tensor * a);
926
+
927
+ // argmax along rows
928
+ GGML_API struct ggml_tensor * ggml_argmax(
929
+ struct ggml_context * ctx,
930
+ struct ggml_tensor * a);
931
+
932
+ // count number of equal elements in a and b
933
+ GGML_API struct ggml_tensor * ggml_count_equal(
934
+ struct ggml_context * ctx,
935
+ struct ggml_tensor * a,
936
+ struct ggml_tensor * b);
937
+
938
+ // if a is the same shape as b, and a is not parameter, return a
939
+ // otherwise, return a new tensor: repeat(a) to fit in b
940
+ GGML_API struct ggml_tensor * ggml_repeat(
941
+ struct ggml_context * ctx,
942
+ struct ggml_tensor * a,
943
+ struct ggml_tensor * b);
944
+
945
+ // sums repetitions in a into shape of b
946
+ GGML_API struct ggml_tensor * ggml_repeat_back(
947
+ struct ggml_context * ctx,
948
+ struct ggml_tensor * a,
949
+ struct ggml_tensor * b);
950
+
951
+ // concat a and b along dim
952
+ // used in stable-diffusion
953
+ GGML_API struct ggml_tensor * ggml_concat(
954
+ struct ggml_context * ctx,
955
+ struct ggml_tensor * a,
956
+ struct ggml_tensor * b,
957
+ int dim);
958
+
959
+ GGML_API struct ggml_tensor * ggml_abs(
960
+ struct ggml_context * ctx,
961
+ struct ggml_tensor * a);
962
+
963
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
964
+ struct ggml_context * ctx,
965
+ struct ggml_tensor * a);
966
+
967
+ GGML_API struct ggml_tensor * ggml_sgn(
968
+ struct ggml_context * ctx,
969
+ struct ggml_tensor * a);
970
+
971
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
972
+ struct ggml_context * ctx,
973
+ struct ggml_tensor * a);
974
+
975
+ GGML_API struct ggml_tensor * ggml_neg(
976
+ struct ggml_context * ctx,
977
+ struct ggml_tensor * a);
978
+
979
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
980
+ struct ggml_context * ctx,
981
+ struct ggml_tensor * a);
982
+
983
+ GGML_API struct ggml_tensor * ggml_step(
984
+ struct ggml_context * ctx,
985
+ struct ggml_tensor * a);
986
+
987
+ GGML_API struct ggml_tensor * ggml_step_inplace(
988
+ struct ggml_context * ctx,
989
+ struct ggml_tensor * a);
990
+
991
+ GGML_API struct ggml_tensor * ggml_tanh(
992
+ struct ggml_context * ctx,
993
+ struct ggml_tensor * a);
994
+
995
+ GGML_API struct ggml_tensor * ggml_tanh_inplace(
996
+ struct ggml_context * ctx,
997
+ struct ggml_tensor * a);
998
+
999
+ GGML_API struct ggml_tensor * ggml_elu(
1000
+ struct ggml_context * ctx,
1001
+ struct ggml_tensor * a);
1002
+
1003
+ GGML_API struct ggml_tensor * ggml_elu_inplace(
1004
+ struct ggml_context * ctx,
1005
+ struct ggml_tensor * a);
1006
+
1007
+ GGML_API struct ggml_tensor * ggml_relu(
1008
+ struct ggml_context * ctx,
1009
+ struct ggml_tensor * a);
1010
+
1011
+ GGML_API struct ggml_tensor * ggml_leaky_relu(
1012
+ struct ggml_context * ctx,
1013
+ struct ggml_tensor * a, float negative_slope, bool inplace);
1014
+
1015
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
1016
+ struct ggml_context * ctx,
1017
+ struct ggml_tensor * a);
1018
+
1019
+ GGML_API struct ggml_tensor * ggml_sigmoid(
1020
+ struct ggml_context * ctx,
1021
+ struct ggml_tensor * a);
1022
+
1023
+ GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
1024
+ struct ggml_context * ctx,
1025
+ struct ggml_tensor * a);
1026
+
1027
+ GGML_API struct ggml_tensor * ggml_gelu(
1028
+ struct ggml_context * ctx,
1029
+ struct ggml_tensor * a);
1030
+
1031
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
1032
+ struct ggml_context * ctx,
1033
+ struct ggml_tensor * a);
1034
+
1035
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
1036
+ struct ggml_context * ctx,
1037
+ struct ggml_tensor * a);
1038
+
1039
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
1040
+ struct ggml_context * ctx,
1041
+ struct ggml_tensor * a);
1042
+
1043
+ GGML_API struct ggml_tensor * ggml_silu(
1044
+ struct ggml_context * ctx,
1045
+ struct ggml_tensor * a);
1046
+
1047
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
1048
+ struct ggml_context * ctx,
1049
+ struct ggml_tensor * a);
1050
+
1051
+ // a - x
1052
+ // b - dy
1053
+ GGML_API struct ggml_tensor * ggml_silu_back(
1054
+ struct ggml_context * ctx,
1055
+ struct ggml_tensor * a,
1056
+ struct ggml_tensor * b);
1057
+
1058
+ // hardswish(x) = x * relu6(x + 3) / 6
1059
+ GGML_API struct ggml_tensor * ggml_hardswish(
1060
+ struct ggml_context * ctx,
1061
+ struct ggml_tensor * a);
1062
+
1063
+ // hardsigmoid(x) = relu6(x + 3) / 6
1064
+ GGML_API struct ggml_tensor * ggml_hardsigmoid(
1065
+ struct ggml_context * ctx,
1066
+ struct ggml_tensor * a);
1067
+
1068
+ GGML_API struct ggml_tensor * ggml_exp(
1069
+ struct ggml_context * ctx,
1070
+ struct ggml_tensor * a);
1071
+
1072
+ GGML_API struct ggml_tensor * ggml_exp_inplace(
1073
+ struct ggml_context * ctx,
1074
+ struct ggml_tensor * a);
1075
+
1076
+ // normalize along rows
1077
+ GGML_API struct ggml_tensor * ggml_norm(
1078
+ struct ggml_context * ctx,
1079
+ struct ggml_tensor * a,
1080
+ float eps);
1081
+
1082
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
1083
+ struct ggml_context * ctx,
1084
+ struct ggml_tensor * a,
1085
+ float eps);
1086
+
1087
+ GGML_API struct ggml_tensor * ggml_rms_norm(
1088
+ struct ggml_context * ctx,
1089
+ struct ggml_tensor * a,
1090
+ float eps);
1091
+
1092
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
1093
+ struct ggml_context * ctx,
1094
+ struct ggml_tensor * a,
1095
+ float eps);
1096
+
1097
+ // group normalize along ne0*ne1*n_groups
1098
+ // used in stable-diffusion
1099
+ GGML_API struct ggml_tensor * ggml_group_norm(
1100
+ struct ggml_context * ctx,
1101
+ struct ggml_tensor * a,
1102
+ int n_groups,
1103
+ float eps);
1104
+
1105
+ GGML_API struct ggml_tensor * ggml_group_norm_inplace(
1106
+ struct ggml_context * ctx,
1107
+ struct ggml_tensor * a,
1108
+ int n_groups,
1109
+ float eps);
1110
+
1111
+ // a - x
1112
+ // b - dy
1113
+ GGML_API struct ggml_tensor * ggml_rms_norm_back(
1114
+ struct ggml_context * ctx,
1115
+ struct ggml_tensor * a,
1116
+ struct ggml_tensor * b,
1117
+ float eps);
1118
+
1119
+ // A: k columns, n rows => [ne03, ne02, n, k]
1120
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1121
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
1122
+ GGML_API struct ggml_tensor * ggml_mul_mat(
1123
+ struct ggml_context * ctx,
1124
+ struct ggml_tensor * a,
1125
+ struct ggml_tensor * b);
1126
+
1127
+ // change the precision of a matrix multiplication
1128
+ // set to GGML_PREC_F32 for higher precision (useful for phi-2)
1129
+ GGML_API void ggml_mul_mat_set_prec(
1130
+ struct ggml_tensor * a,
1131
+ enum ggml_prec prec);
1132
+
1133
+ // indirect matrix multiplication
1134
+ GGML_API struct ggml_tensor * ggml_mul_mat_id(
1135
+ struct ggml_context * ctx,
1136
+ struct ggml_tensor * as,
1137
+ struct ggml_tensor * b,
1138
+ struct ggml_tensor * ids);
1139
+
1140
+ // A: m columns, n rows,
1141
+ // B: p columns, n rows,
1142
+ // result is m columns, p rows
1143
+ GGML_API struct ggml_tensor * ggml_out_prod(
1144
+ struct ggml_context * ctx,
1145
+ struct ggml_tensor * a,
1146
+ struct ggml_tensor * b);
1147
+
1148
+ //
1149
+ // operations on tensors without backpropagation
1150
+ //
1151
+
1152
+ GGML_API struct ggml_tensor * ggml_scale(
1153
+ struct ggml_context * ctx,
1154
+ struct ggml_tensor * a,
1155
+ float s);
1156
+
1157
+ // in-place, returns view(a)
1158
+ GGML_API struct ggml_tensor * ggml_scale_inplace(
1159
+ struct ggml_context * ctx,
1160
+ struct ggml_tensor * a,
1161
+ float s);
1162
+
1163
+ // b -> view(a,offset,nb1,nb2,3), return modified a
1164
+ GGML_API struct ggml_tensor * ggml_set(
1165
+ struct ggml_context * ctx,
1166
+ struct ggml_tensor * a,
1167
+ struct ggml_tensor * b,
1168
+ size_t nb1,
1169
+ size_t nb2,
1170
+ size_t nb3,
1171
+ size_t offset); // in bytes
1172
+
1173
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
1174
+ GGML_API struct ggml_tensor * ggml_set_inplace(
1175
+ struct ggml_context * ctx,
1176
+ struct ggml_tensor * a,
1177
+ struct ggml_tensor * b,
1178
+ size_t nb1,
1179
+ size_t nb2,
1180
+ size_t nb3,
1181
+ size_t offset); // in bytes
1182
+
1183
+ GGML_API struct ggml_tensor * ggml_set_1d(
1184
+ struct ggml_context * ctx,
1185
+ struct ggml_tensor * a,
1186
+ struct ggml_tensor * b,
1187
+ size_t offset); // in bytes
1188
+
1189
+ GGML_API struct ggml_tensor * ggml_set_1d_inplace(
1190
+ struct ggml_context * ctx,
1191
+ struct ggml_tensor * a,
1192
+ struct ggml_tensor * b,
1193
+ size_t offset); // in bytes
1194
+
1195
+ // b -> view(a,offset,nb1,nb2,3), return modified a
1196
+ GGML_API struct ggml_tensor * ggml_set_2d(
1197
+ struct ggml_context * ctx,
1198
+ struct ggml_tensor * a,
1199
+ struct ggml_tensor * b,
1200
+ size_t nb1,
1201
+ size_t offset); // in bytes
1202
+
1203
+ // b -> view(a,offset,nb1,nb2,3), return view(a)
1204
+ GGML_API struct ggml_tensor * ggml_set_2d_inplace(
1205
+ struct ggml_context * ctx,
1206
+ struct ggml_tensor * a,
1207
+ struct ggml_tensor * b,
1208
+ size_t nb1,
1209
+ size_t offset); // in bytes
1210
+
1211
+ // a -> b, return view(b)
1212
+ GGML_API struct ggml_tensor * ggml_cpy(
1213
+ struct ggml_context * ctx,
1214
+ struct ggml_tensor * a,
1215
+ struct ggml_tensor * b);
1216
+
1217
+ GGML_API struct ggml_tensor * ggml_cast(
1218
+ struct ggml_context * ctx,
1219
+ struct ggml_tensor * a,
1220
+ enum ggml_type type);
1221
+
1222
+ // make contiguous
1223
+ GGML_API struct ggml_tensor * ggml_cont(
1224
+ struct ggml_context * ctx,
1225
+ struct ggml_tensor * a);
1226
+
1227
+ // make contiguous, with new shape
1228
+ GGML_API struct ggml_tensor * ggml_cont_1d(
1229
+ struct ggml_context * ctx,
1230
+ struct ggml_tensor * a,
1231
+ int64_t ne0);
1232
+
1233
+ GGML_API struct ggml_tensor * ggml_cont_2d(
1234
+ struct ggml_context * ctx,
1235
+ struct ggml_tensor * a,
1236
+ int64_t ne0,
1237
+ int64_t ne1);
1238
+
1239
+ GGML_API struct ggml_tensor * ggml_cont_3d(
1240
+ struct ggml_context * ctx,
1241
+ struct ggml_tensor * a,
1242
+ int64_t ne0,
1243
+ int64_t ne1,
1244
+ int64_t ne2);
1245
+
1246
+ GGML_API struct ggml_tensor * ggml_cont_4d(
1247
+ struct ggml_context * ctx,
1248
+ struct ggml_tensor * a,
1249
+ int64_t ne0,
1250
+ int64_t ne1,
1251
+ int64_t ne2,
1252
+ int64_t ne3);
1253
+
1254
+ // return view(a), b specifies the new shape
1255
+ // TODO: when we start computing gradient, make a copy instead of view
1256
+ GGML_API struct ggml_tensor * ggml_reshape(
1257
+ struct ggml_context * ctx,
1258
+ struct ggml_tensor * a,
1259
+ struct ggml_tensor * b);
1260
+
1261
+ // return view(a)
1262
+ // TODO: when we start computing gradient, make a copy instead of view
1263
+ GGML_API struct ggml_tensor * ggml_reshape_1d(
1264
+ struct ggml_context * ctx,
1265
+ struct ggml_tensor * a,
1266
+ int64_t ne0);
1267
+
1268
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
1269
+ struct ggml_context * ctx,
1270
+ struct ggml_tensor * a,
1271
+ int64_t ne0,
1272
+ int64_t ne1);
1273
+
1274
+ // return view(a)
1275
+ // TODO: when we start computing gradient, make a copy instead of view
1276
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
1277
+ struct ggml_context * ctx,
1278
+ struct ggml_tensor * a,
1279
+ int64_t ne0,
1280
+ int64_t ne1,
1281
+ int64_t ne2);
1282
+
1283
+ GGML_API struct ggml_tensor * ggml_reshape_4d(
1284
+ struct ggml_context * ctx,
1285
+ struct ggml_tensor * a,
1286
+ int64_t ne0,
1287
+ int64_t ne1,
1288
+ int64_t ne2,
1289
+ int64_t ne3);
1290
+
1291
+ // offset in bytes
1292
+ GGML_API struct ggml_tensor * ggml_view_1d(
1293
+ struct ggml_context * ctx,
1294
+ struct ggml_tensor * a,
1295
+ int64_t ne0,
1296
+ size_t offset);
1297
+
1298
+ GGML_API struct ggml_tensor * ggml_view_2d(
1299
+ struct ggml_context * ctx,
1300
+ struct ggml_tensor * a,
1301
+ int64_t ne0,
1302
+ int64_t ne1,
1303
+ size_t nb1, // row stride in bytes
1304
+ size_t offset);
1305
+
1306
+ GGML_API struct ggml_tensor * ggml_view_3d(
1307
+ struct ggml_context * ctx,
1308
+ struct ggml_tensor * a,
1309
+ int64_t ne0,
1310
+ int64_t ne1,
1311
+ int64_t ne2,
1312
+ size_t nb1, // row stride in bytes
1313
+ size_t nb2, // slice stride in bytes
1314
+ size_t offset);
1315
+
1316
+ GGML_API struct ggml_tensor * ggml_view_4d(
1317
+ struct ggml_context * ctx,
1318
+ struct ggml_tensor * a,
1319
+ int64_t ne0,
1320
+ int64_t ne1,
1321
+ int64_t ne2,
1322
+ int64_t ne3,
1323
+ size_t nb1, // row stride in bytes
1324
+ size_t nb2, // slice stride in bytes
1325
+ size_t nb3,
1326
+ size_t offset);
1327
+
1328
+ GGML_API struct ggml_tensor * ggml_permute(
1329
+ struct ggml_context * ctx,
1330
+ struct ggml_tensor * a,
1331
+ int axis0,
1332
+ int axis1,
1333
+ int axis2,
1334
+ int axis3);
1335
+
1336
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
1337
+ GGML_API struct ggml_tensor * ggml_transpose(
1338
+ struct ggml_context * ctx,
1339
+ struct ggml_tensor * a);
1340
+
1341
+ // supports 3D: a->ne[2] == b->ne[1]
1342
+ GGML_API struct ggml_tensor * ggml_get_rows(
1343
+ struct ggml_context * ctx,
1344
+ struct ggml_tensor * a, // data
1345
+ struct ggml_tensor * b); // row indices
1346
+
1347
+ GGML_API struct ggml_tensor * ggml_get_rows_back(
1348
+ struct ggml_context * ctx,
1349
+ struct ggml_tensor * a, // gradients of ggml_get_rows result
1350
+ struct ggml_tensor * b, // row indices
1351
+ struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
1352
+
1353
+ GGML_API struct ggml_tensor * ggml_diag(
1354
+ struct ggml_context * ctx,
1355
+ struct ggml_tensor * a);
1356
+
1357
+ // set elements above the diagonal to -INF
1358
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
1359
+ struct ggml_context * ctx,
1360
+ struct ggml_tensor * a,
1361
+ int n_past);
1362
+
1363
+ // in-place, returns view(a)
1364
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
1365
+ struct ggml_context * ctx,
1366
+ struct ggml_tensor * a,
1367
+ int n_past);
1368
+
1369
+ // set elements above the diagonal to 0
1370
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero(
1371
+ struct ggml_context * ctx,
1372
+ struct ggml_tensor * a,
1373
+ int n_past);
1374
+
1375
+ // in-place, returns view(a)
1376
+ GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
1377
+ struct ggml_context * ctx,
1378
+ struct ggml_tensor * a,
1379
+ int n_past);
1380
+
1381
+ GGML_API struct ggml_tensor * ggml_soft_max(
1382
+ struct ggml_context * ctx,
1383
+ struct ggml_tensor * a);
1384
+
1385
+ // in-place, returns view(a)
1386
+ GGML_API struct ggml_tensor * ggml_soft_max_inplace(
1387
+ struct ggml_context * ctx,
1388
+ struct ggml_tensor * a);
1389
+
1390
+ // fused soft_max(a*scale + mask*(ALiBi slope))
1391
+ // mask is optional
1392
+ // max_bias = 0.0f for no ALiBi
1393
+ GGML_API struct ggml_tensor * ggml_soft_max_ext(
1394
+ struct ggml_context * ctx,
1395
+ struct ggml_tensor * a,
1396
+ struct ggml_tensor * mask,
1397
+ float scale,
1398
+ float max_bias);
1399
+
1400
+ GGML_API struct ggml_tensor * ggml_soft_max_back(
1401
+ struct ggml_context * ctx,
1402
+ struct ggml_tensor * a,
1403
+ struct ggml_tensor * b);
1404
+
1405
+ // in-place, returns view(a)
1406
+ GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
1407
+ struct ggml_context * ctx,
1408
+ struct ggml_tensor * a,
1409
+ struct ggml_tensor * b);
1410
+
1411
+ // rotary position embedding
1412
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1413
+ // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1414
+ //
1415
+ // b is an int32 vector with size a->ne[2], it contains the positions
1416
+ GGML_API struct ggml_tensor * ggml_rope(
1417
+ struct ggml_context * ctx,
1418
+ struct ggml_tensor * a,
1419
+ struct ggml_tensor * b,
1420
+ int n_dims,
1421
+ int mode);
1422
+
1423
+ // in-place, returns view(a)
1424
+ GGML_API struct ggml_tensor * ggml_rope_inplace(
1425
+ struct ggml_context * ctx,
1426
+ struct ggml_tensor * a,
1427
+ struct ggml_tensor * b,
1428
+ int n_dims,
1429
+ int mode);
1430
+
1431
+ // custom RoPE
1432
+ // c is freq factors (e.g. phi3-128k), (optional)
1433
+ GGML_API struct ggml_tensor * ggml_rope_ext(
1434
+ struct ggml_context * ctx,
1435
+ struct ggml_tensor * a,
1436
+ struct ggml_tensor * b,
1437
+ struct ggml_tensor * c,
1438
+ int n_dims,
1439
+ int mode,
1440
+ int n_ctx_orig,
1441
+ float freq_base,
1442
+ float freq_scale,
1443
+ float ext_factor,
1444
+ float attn_factor,
1445
+ float beta_fast,
1446
+ float beta_slow);
1447
+
1448
+ GGML_API struct ggml_tensor * ggml_rope_multi(
1449
+ struct ggml_context * ctx,
1450
+ struct ggml_tensor * a,
1451
+ struct ggml_tensor * b,
1452
+ struct ggml_tensor * c,
1453
+ int n_dims,
1454
+ int sections[4],
1455
+ int mode,
1456
+ int n_ctx_orig,
1457
+ float freq_base,
1458
+ float freq_scale,
1459
+ float ext_factor,
1460
+ float attn_factor,
1461
+ float beta_fast,
1462
+ float beta_slow);
1463
+
1464
+ // in-place, returns view(a)
1465
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
1466
+ struct ggml_context * ctx,
1467
+ struct ggml_tensor * a,
1468
+ struct ggml_tensor * b,
1469
+ struct ggml_tensor * c,
1470
+ int n_dims,
1471
+ int mode,
1472
+ int n_ctx_orig,
1473
+ float freq_base,
1474
+ float freq_scale,
1475
+ float ext_factor,
1476
+ float attn_factor,
1477
+ float beta_fast,
1478
+ float beta_slow);
1479
+
1480
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
1481
+ struct ggml_context * ctx,
1482
+ struct ggml_tensor * a,
1483
+ struct ggml_tensor * b,
1484
+ int n_dims,
1485
+ int mode,
1486
+ int n_ctx_orig,
1487
+ float freq_base,
1488
+ float freq_scale,
1489
+ float ext_factor,
1490
+ float attn_factor,
1491
+ float beta_fast,
1492
+ float beta_slow),
1493
+ "use ggml_rope_ext instead");
1494
+
1495
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
1496
+ struct ggml_context * ctx,
1497
+ struct ggml_tensor * a,
1498
+ struct ggml_tensor * b,
1499
+ int n_dims,
1500
+ int mode,
1501
+ int n_ctx_orig,
1502
+ float freq_base,
1503
+ float freq_scale,
1504
+ float ext_factor,
1505
+ float attn_factor,
1506
+ float beta_fast,
1507
+ float beta_slow),
1508
+ "use ggml_rope_ext_inplace instead");
1509
+
1510
+ // compute correction dims for YaRN RoPE scaling
1511
+ GGML_API void ggml_rope_yarn_corr_dims(
1512
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1513
+
1514
+ // rotary position embedding backward, i.e. compute dx from dy
1515
+ // a - dy
1516
+ GGML_API struct ggml_tensor * ggml_rope_back(
1517
+ struct ggml_context * ctx,
1518
+ struct ggml_tensor * a, // gradients of ggml_rope result
1519
+ struct ggml_tensor * b, // positions
1520
+ struct ggml_tensor * c, // freq factors
1521
+ int n_dims,
1522
+ int mode,
1523
+ int n_ctx_orig,
1524
+ float freq_base,
1525
+ float freq_scale,
1526
+ float ext_factor,
1527
+ float attn_factor,
1528
+ float beta_fast,
1529
+ float beta_slow);
1530
+
1531
+ // clamp
1532
+ // in-place, returns view(a)
1533
+ GGML_API struct ggml_tensor * ggml_clamp(
1534
+ struct ggml_context * ctx,
1535
+ struct ggml_tensor * a,
1536
+ float min,
1537
+ float max);
1538
+
1539
+ // im2col
1540
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
1541
+ GGML_API struct ggml_tensor * ggml_im2col(
1542
+ struct ggml_context * ctx,
1543
+ struct ggml_tensor * a, // convolution kernel
1544
+ struct ggml_tensor * b, // data
1545
+ int s0, // stride dimension 0
1546
+ int s1, // stride dimension 1
1547
+ int p0, // padding dimension 0
1548
+ int p1, // padding dimension 1
1549
+ int d0, // dilation dimension 0
1550
+ int d1, // dilation dimension 1
1551
+ bool is_2D,
1552
+ enum ggml_type dst_type);
1553
+
1554
+ GGML_API struct ggml_tensor * ggml_im2col_back(
1555
+ struct ggml_context * ctx,
1556
+ struct ggml_tensor * a, // convolution kernel
1557
+ struct ggml_tensor * b, // gradient of im2col output
1558
+ int64_t * ne, // shape of im2col input
1559
+ int s0, // stride dimension 0
1560
+ int s1, // stride dimension 1
1561
+ int p0, // padding dimension 0
1562
+ int p1, // padding dimension 1
1563
+ int d0, // dilation dimension 0
1564
+ int d1, // dilation dimension 1
1565
+ bool is_2D);
1566
+
1567
+ GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
1568
+ struct ggml_context * ctx,
1569
+ struct ggml_tensor * a, // convolution kernel
1570
+ struct ggml_tensor * b, // data
1571
+ int s0, // stride dimension 0
1572
+ int s1, // stride dimension 1
1573
+ int p0, // padding dimension 0
1574
+ int p1, // padding dimension 1
1575
+ int d0, // dilation dimension 0
1576
+ int d1); // dilation dimension 1
1577
+
1578
+ GGML_API struct ggml_tensor * ggml_conv_1d(
1579
+ struct ggml_context * ctx,
1580
+ struct ggml_tensor * a, // convolution kernel
1581
+ struct ggml_tensor * b, // data
1582
+ int s0, // stride
1583
+ int p0, // padding
1584
+ int d0); // dilation
1585
+
1586
+ // conv_1d with padding = half
1587
+ // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1588
+ GGML_API struct ggml_tensor* ggml_conv_1d_ph(
1589
+ struct ggml_context * ctx,
1590
+ struct ggml_tensor * a, // convolution kernel
1591
+ struct ggml_tensor * b, // data
1592
+ int s, // stride
1593
+ int d); // dilation
1594
+
1595
+ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
1596
+ struct ggml_context * ctx,
1597
+ struct ggml_tensor * a, // convolution kernel
1598
+ struct ggml_tensor * b, // data
1599
+ int s0, // stride
1600
+ int p0, // padding
1601
+ int d0); // dilation
1602
+
1603
+ GGML_API struct ggml_tensor * ggml_conv_2d(
1604
+ struct ggml_context * ctx,
1605
+ struct ggml_tensor * a, // convolution kernel
1606
+ struct ggml_tensor * b, // data
1607
+ int s0, // stride dimension 0
1608
+ int s1, // stride dimension 1
1609
+ int p0, // padding dimension 0
1610
+ int p1, // padding dimension 1
1611
+ int d0, // dilation dimension 0
1612
+ int d1); // dilation dimension 1
1613
+
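// usage sketch (illustrative, not from ggml.h): a plain 2-D convolution. With stride s,
// padding p, dilation d and kernel size k, each spatial output dimension follows the usual
// formula out = (in + 2*p - d*(k - 1) - 1) / s + 1; internally the op is im2col followed by
// a matrix multiplication (see ggml_im2col above).
static struct ggml_tensor * conv3x3_same(struct ggml_context * ctx,
                                         struct ggml_tensor  * kernel, // [3, 3, C_in, C_out]
                                         struct ggml_tensor  * x) {    // [W, H, C_in, N]
    // stride 1, padding 1, dilation 1 keeps the spatial size ("same" convolution)
    return ggml_conv_2d(ctx, kernel, x, 1, 1, 1, 1, 1, 1);
}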
1614
+
1615
+ // kernel size is a->ne[0] x a->ne[1]
1616
+ // stride is equal to kernel size
1617
+ // padding is zero
1618
+ // example:
1619
+ // a: 16 16 3 768
1620
+ // b: 1024 1024 3 1
1621
+ // res: 64 64 768 1
1622
+ // used in sam
1623
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1624
+ struct ggml_context * ctx,
1625
+ struct ggml_tensor * a,
1626
+ struct ggml_tensor * b);
1627
+
1628
+ // kernel size is a->ne[0] x a->ne[1]
1629
+ // stride is 1
1630
+ // padding is half
1631
+ // example:
1632
+ // a: 3 3 256 256
1633
+ // b: 64 64 256 1
1634
+ // res: 64 64 256 1
1635
+ // used in sam
1636
+ GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
1637
+ struct ggml_context * ctx,
1638
+ struct ggml_tensor * a,
1639
+ struct ggml_tensor * b);
1640
+
1641
+ GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
1642
+ struct ggml_context * ctx,
1643
+ struct ggml_tensor * a,
1644
+ struct ggml_tensor * b,
1645
+ int stride);
1646
+
1647
+ enum ggml_op_pool {
1648
+ GGML_OP_POOL_MAX,
1649
+ GGML_OP_POOL_AVG,
1650
+ GGML_OP_POOL_COUNT,
1651
+ };
1652
+
1653
+ GGML_API struct ggml_tensor * ggml_pool_1d(
1654
+ struct ggml_context * ctx,
1655
+ struct ggml_tensor * a,
1656
+ enum ggml_op_pool op,
1657
+ int k0, // kernel size
1658
+ int s0, // stride
1659
+ int p0); // padding
1660
+
1661
+ // the result will have 2*p0 padding for the first dimension
1662
+ // and 2*p1 padding for the second dimension
1663
+ GGML_API struct ggml_tensor * ggml_pool_2d(
1664
+ struct ggml_context * ctx,
1665
+ struct ggml_tensor * a,
1666
+ enum ggml_op_pool op,
1667
+ int k0,
1668
+ int k1,
1669
+ int s0,
1670
+ int s1,
1671
+ float p0,
1672
+ float p1);
1673
+
1674
+ GGML_API struct ggml_tensor * ggml_pool_2d_back(
1675
+ struct ggml_context * ctx,
1676
+ struct ggml_tensor * a,
1677
+ struct ggml_tensor * af, // "a"/input used in forward pass
1678
+ enum ggml_op_pool op,
1679
+ int k0,
1680
+ int k1,
1681
+ int s0,
1682
+ int s1,
1683
+ float p0,
1684
+ float p1);
1685
+
1686
+ // nearest interpolate
1687
+ // multiplies ne0 and ne1 by scale factor
1688
+ // used in stable-diffusion
1689
+ GGML_API struct ggml_tensor * ggml_upscale(
1690
+ struct ggml_context * ctx,
1691
+ struct ggml_tensor * a,
1692
+ int scale_factor);
1693
+
1694
+ // nearest interpolate
1695
+ // nearest interpolate to specified dimensions
1696
+ // used in tortoise.cpp
1697
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
1698
+ struct ggml_context * ctx,
1699
+ struct ggml_tensor * a,
1700
+ int ne0,
1701
+ int ne1,
1702
+ int ne2,
1703
+ int ne3);
1704
+
1705
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1706
+ GGML_API struct ggml_tensor * ggml_pad(
1707
+ struct ggml_context * ctx,
1708
+ struct ggml_tensor * a,
1709
+ int p0,
1710
+ int p1,
1711
+ int p2,
1712
+ int p3);
1713
+
1714
+ // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
1715
+ GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
1716
+ struct ggml_context * ctx,
1717
+ struct ggml_tensor * a,
1718
+ int p0,
1719
+ int p1);
1720
+
1721
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1722
+ // timesteps: [N,]
1723
+ // return: [N, dim]
1724
+ GGML_API struct ggml_tensor * ggml_timestep_embedding(
1725
+ struct ggml_context * ctx,
1726
+ struct ggml_tensor * timesteps,
1727
+ int dim,
1728
+ int max_period);
1729
+
1730
+ // sort rows
1731
+ enum ggml_sort_order {
1732
+ GGML_SORT_ORDER_ASC,
1733
+ GGML_SORT_ORDER_DESC,
1734
+ };
1735
+
1736
+ GGML_API struct ggml_tensor * ggml_argsort(
1737
+ struct ggml_context * ctx,
1738
+ struct ggml_tensor * a,
1739
+ enum ggml_sort_order order);
1740
+
1741
+ GGML_API struct ggml_tensor * ggml_arange(
1742
+ struct ggml_context * ctx,
1743
+ float start,
1744
+ float stop,
1745
+ float step);
1746
+
1747
+ // top k elements per row
1748
+ GGML_API struct ggml_tensor * ggml_top_k(
1749
+ struct ggml_context * ctx,
1750
+ struct ggml_tensor * a,
1751
+ int k);
1752
+
1753
+ #define GGML_KQ_MASK_PAD 32
1754
+
1755
+ // q: [n_embd, n_batch, n_head, 1]
1756
+ // k: [n_embd, n_kv, n_head_kv, 1]
1757
+ // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1758
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
1759
+ // res: [n_embd, n_head, n_batch, 1] !! permuted !!
1760
+ GGML_API struct ggml_tensor * ggml_flash_attn_ext(
1761
+ struct ggml_context * ctx,
1762
+ struct ggml_tensor * q,
1763
+ struct ggml_tensor * k,
1764
+ struct ggml_tensor * v,
1765
+ struct ggml_tensor * mask,
1766
+ float scale,
1767
+ float max_bias,
1768
+ float logit_softcap);
1769
+
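// usage sketch (illustrative, not from ggml.h): calling the fused attention op with the
// shapes documented above. The mask, if used, must be padded to GGML_PAD(n_batch,
// GGML_KQ_MASK_PAD) rows; scale is usually 1/sqrt(head dim), and max_bias / logit_softcap
// are 0.0f when unused. Assumes <math.h> for sqrtf.
static struct ggml_tensor * fused_attn(struct ggml_context * ctx,
                                       struct ggml_tensor  * q,    // [n_embd, n_batch, n_head,    1]
                                       struct ggml_tensor  * k,    // [n_embd, n_kv,    n_head_kv, 1]
                                       struct ggml_tensor  * v,    // [n_embd, n_kv,    n_head_kv, 1]
                                       struct ggml_tensor  * mask, // [n_kv, n_batch_pad, 1, 1] or NULL
                                       int                   n_embd_head) {
    const float scale = 1.0f / sqrtf((float) n_embd_head);
    // result is [n_embd, n_head, n_batch, 1] (permuted), as noted in the comments above
    return ggml_flash_attn_ext(ctx, q, k, v, mask, scale, /*max_bias=*/0.0f, /*logit_softcap=*/0.0f);
}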
1770
+ GGML_API void ggml_flash_attn_ext_set_prec(
1771
+ struct ggml_tensor * a,
1772
+ enum ggml_prec prec);
1773
+
1774
+ GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
1775
+ const struct ggml_tensor * a);
1776
+
1777
+ // TODO: needs to be adapted to ggml_flash_attn_ext
1778
+ GGML_API struct ggml_tensor * ggml_flash_attn_back(
1779
+ struct ggml_context * ctx,
1780
+ struct ggml_tensor * q,
1781
+ struct ggml_tensor * k,
1782
+ struct ggml_tensor * v,
1783
+ struct ggml_tensor * d,
1784
+ bool masked);
1785
+
1786
+ GGML_API struct ggml_tensor * ggml_ssm_conv(
1787
+ struct ggml_context * ctx,
1788
+ struct ggml_tensor * sx,
1789
+ struct ggml_tensor * c);
1790
+
1791
+ GGML_API struct ggml_tensor * ggml_ssm_scan(
1792
+ struct ggml_context * ctx,
1793
+ struct ggml_tensor * s,
1794
+ struct ggml_tensor * x,
1795
+ struct ggml_tensor * dt,
1796
+ struct ggml_tensor * A,
1797
+ struct ggml_tensor * B,
1798
+ struct ggml_tensor * C);
1799
+
1800
+ // partition into non-overlapping windows with padding if needed
1801
+ // example:
1802
+ // a: 768 64 64 1
1803
+ // w: 14
1804
+ // res: 768 14 14 25
1805
+ // used in sam
1806
+ GGML_API struct ggml_tensor * ggml_win_part(
1807
+ struct ggml_context * ctx,
1808
+ struct ggml_tensor * a,
1809
+ int w);
1810
+
1811
+ // reverse of ggml_win_part
1812
+ // used in sam
1813
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1814
+ struct ggml_context * ctx,
1815
+ struct ggml_tensor * a,
1816
+ int w0,
1817
+ int h0,
1818
+ int w);
1819
+
1820
+ GGML_API struct ggml_tensor * ggml_unary(
1821
+ struct ggml_context * ctx,
1822
+ struct ggml_tensor * a,
1823
+ enum ggml_unary_op op);
1824
+
1825
+ GGML_API struct ggml_tensor * ggml_unary_inplace(
1826
+ struct ggml_context * ctx,
1827
+ struct ggml_tensor * a,
1828
+ enum ggml_unary_op op);
1829
+
1830
+ // used in sam
1831
+ GGML_API struct ggml_tensor * ggml_get_rel_pos(
1832
+ struct ggml_context * ctx,
1833
+ struct ggml_tensor * a,
1834
+ int qh,
1835
+ int kh);
1836
+
1837
+ // used in sam
1838
+ GGML_API struct ggml_tensor * ggml_add_rel_pos(
1839
+ struct ggml_context * ctx,
1840
+ struct ggml_tensor * a,
1841
+ struct ggml_tensor * pw,
1842
+ struct ggml_tensor * ph);
1843
+
1844
+ GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
1845
+ struct ggml_context * ctx,
1846
+ struct ggml_tensor * a,
1847
+ struct ggml_tensor * pw,
1848
+ struct ggml_tensor * ph);
1849
+
1850
+ GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
1851
+ struct ggml_context * ctx,
1852
+ struct ggml_tensor * k,
1853
+ struct ggml_tensor * v,
1854
+ struct ggml_tensor * r,
1855
+ struct ggml_tensor * tf,
1856
+ struct ggml_tensor * td,
1857
+ struct ggml_tensor * state);
1858
+
1859
+ // custom operators
1860
+
1861
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
1862
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1863
+
1864
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
1865
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1866
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
1867
+
1868
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
1869
+ struct ggml_context * ctx,
1870
+ struct ggml_tensor * a,
1871
+ ggml_unary_op_f32_t fun),
1872
+ "use ggml_map_custom1 instead");
1873
+
1874
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
1875
+ struct ggml_context * ctx,
1876
+ struct ggml_tensor * a,
1877
+ ggml_unary_op_f32_t fun),
1878
+ "use ggml_map_custom1_inplace instead");
1879
+
1880
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
1881
+ struct ggml_context * ctx,
1882
+ struct ggml_tensor * a,
1883
+ struct ggml_tensor * b,
1884
+ ggml_binary_op_f32_t fun),
1885
+ "use ggml_map_custom2 instead");
1886
+
1887
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
1888
+ struct ggml_context * ctx,
1889
+ struct ggml_tensor * a,
1890
+ struct ggml_tensor * b,
1891
+ ggml_binary_op_f32_t fun),
1892
+ "use ggml_map_custom2_inplace instead");
1893
+
1894
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
1895
+ struct ggml_context * ctx,
1896
+ struct ggml_tensor * a,
1897
+ ggml_custom1_op_f32_t fun),
1898
+ "use ggml_map_custom1 instead");
1899
+
1900
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
1901
+ struct ggml_context * ctx,
1902
+ struct ggml_tensor * a,
1903
+ ggml_custom1_op_f32_t fun),
1904
+ "use ggml_map_custom1_inplace instead");
1905
+
1906
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
1907
+ struct ggml_context * ctx,
1908
+ struct ggml_tensor * a,
1909
+ struct ggml_tensor * b,
1910
+ ggml_custom2_op_f32_t fun),
1911
+ "use ggml_map_custom2 instead");
1912
+
1913
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
1914
+ struct ggml_context * ctx,
1915
+ struct ggml_tensor * a,
1916
+ struct ggml_tensor * b,
1917
+ ggml_custom2_op_f32_t fun),
1918
+ "use ggml_map_custom2_inplace instead");
1919
+
1920
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
1921
+ struct ggml_context * ctx,
1922
+ struct ggml_tensor * a,
1923
+ struct ggml_tensor * b,
1924
+ struct ggml_tensor * c,
1925
+ ggml_custom3_op_f32_t fun),
1926
+ "use ggml_map_custom3 instead");
1927
+
1928
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
1929
+ struct ggml_context * ctx,
1930
+ struct ggml_tensor * a,
1931
+ struct ggml_tensor * b,
1932
+ struct ggml_tensor * c,
1933
+ ggml_custom3_op_f32_t fun),
1934
+ "use ggml_map_custom3_inplace instead");
1935
+
1936
+ // custom operators v2
1937
+
1938
+ typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
1939
+ typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
1940
+ typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
1941
+
1942
+ #define GGML_N_TASKS_MAX (-1)
1943
+ // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
1944
+
1945
+ GGML_API struct ggml_tensor * ggml_map_custom1(
1946
+ struct ggml_context * ctx,
1947
+ struct ggml_tensor * a,
1948
+ ggml_custom1_op_t fun,
1949
+ int n_tasks,
1950
+ void * userdata);
1951
+
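// usage sketch (illustrative, not from ggml.h): a custom element-wise op via ggml_map_custom1.
// The callback matches ggml_custom1_op_t and is invoked by n_tasks threads; thread ith of nth
// handles its own slice of the elements. Assumes contiguous F32 tensors with allocated data.
static void scale_by_two(struct ggml_tensor * dst, const struct ggml_tensor * a,
                         int ith, int nth, void * userdata) {
    (void) userdata;
    const int64_t n     = ggml_nelements(dst);
    const int64_t per   = (n + nth - 1) / nth;          // elements per thread, rounded up
    const int64_t start = ith * per;
    const int64_t end   = start + per < n ? start + per : n;
    const float * src = (const float *) a->data;
    float       * out = (float *) dst->data;
    for (int64_t i = start; i < end; ++i) {
        out[i] = 2.0f * src[i];
    }
}
// usage: struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_by_two, GGML_N_TASKS_MAX, NULL);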
1952
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
1953
+ struct ggml_context * ctx,
1954
+ struct ggml_tensor * a,
1955
+ ggml_custom1_op_t fun,
1956
+ int n_tasks,
1957
+ void * userdata);
1958
+
1959
+ GGML_API struct ggml_tensor * ggml_map_custom2(
1960
+ struct ggml_context * ctx,
1961
+ struct ggml_tensor * a,
1962
+ struct ggml_tensor * b,
1963
+ ggml_custom2_op_t fun,
1964
+ int n_tasks,
1965
+ void * userdata);
1966
+
1967
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
1968
+ struct ggml_context * ctx,
1969
+ struct ggml_tensor * a,
1970
+ struct ggml_tensor * b,
1971
+ ggml_custom2_op_t fun,
1972
+ int n_tasks,
1973
+ void * userdata);
1974
+
1975
+ GGML_API struct ggml_tensor * ggml_map_custom3(
1976
+ struct ggml_context * ctx,
1977
+ struct ggml_tensor * a,
1978
+ struct ggml_tensor * b,
1979
+ struct ggml_tensor * c,
1980
+ ggml_custom3_op_t fun,
1981
+ int n_tasks,
1982
+ void * userdata);
1983
+
1984
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
1985
+ struct ggml_context * ctx,
1986
+ struct ggml_tensor * a,
1987
+ struct ggml_tensor * b,
1988
+ struct ggml_tensor * c,
1989
+ ggml_custom3_op_t fun,
1990
+ int n_tasks,
1991
+ void * userdata);
1992
+
1993
+ // loss function
1994
+
1995
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1996
+ struct ggml_context * ctx,
1997
+ struct ggml_tensor * a, // logits
1998
+ struct ggml_tensor * b); // labels
1999
+
2000
+ GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
2001
+ struct ggml_context * ctx,
2002
+ struct ggml_tensor * a, // logits
2003
+ struct ggml_tensor * b, // labels
2004
+ struct ggml_tensor * c); // gradients of cross_entropy_loss result
2005
+
2006
+ // AdamW optimizer step
2007
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2008
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2009
+ GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2010
+ struct ggml_context * ctx,
2011
+ struct ggml_tensor * a,
2012
+ struct ggml_tensor * grad,
2013
+ struct ggml_tensor * m,
2014
+ struct ggml_tensor * v,
2015
+ struct ggml_tensor * adamw_params); // parameters such as the learning rate
2016
+
2017
+ //
2018
+ // automatic differentiation
2019
+ //
2020
+
2021
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2022
+ GGML_API void ggml_build_backward_expand(
2023
+ struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
2024
+ struct ggml_context * ctx_compute, // context for gradient computation
2025
+ struct ggml_cgraph * cgraph,
2026
+ bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
2027
+
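// usage sketch (illustrative, not from ggml.h): building a forward graph. The computation
// itself is then dispatched through a backend (e.g. ggml_backend_graph_compute from
// ggml-backend.h) or the CPU helpers declared in ggml-cpu.h; only graph construction is
// shown here.
static struct ggml_cgraph * build_mul_mat_graph(struct ggml_context * ctx,
                                                struct ggml_tensor  * a,
                                                struct ggml_tensor  * b) {
    struct ggml_cgraph * gf  = ggml_new_graph(ctx);
    struct ggml_tensor * out = ggml_mul_mat(ctx, a, b);
    ggml_build_forward_expand(gf, out);   // records `out` and all of its dependencies as nodes
    return gf;
}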
2028
+ // graph allocation in a context
2029
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2030
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2031
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
2032
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2033
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2034
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
2035
+
2036
+ GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2037
+ GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2038
+ GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2039
+ GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
2040
+
2041
+ GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2042
+
2043
+ GGML_API size_t ggml_graph_overhead(void);
2044
+ GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
2045
+
2046
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
2047
+ GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2048
+ GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2049
+
2050
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
2051
+ GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
2052
+
2053
+ // print info and performance information for the graph
2054
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
2055
+
2056
+ // dump the graph into a file using the dot format
2057
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
2058
+
2059
+ // TODO: these functions were sandwiched into the old optimization interface; is there a better place for them?
2060
+ typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2061
+
2062
+ // Set callback for all future logging events.
2063
+ // If this is not called, or NULL is supplied, everything is output on stderr.
2064
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
2065
+
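// usage sketch (illustrative, not from ggml.h): redirecting ggml logging through user_data.
// Assumes <stdio.h>; the log text already includes its own formatting.
static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * f = (FILE *) user_data;                 // e.g. a log file opened by the caller
    fprintf(f, "[ggml:%d] %s", (int) level, text);
}
// usage: ggml_log_set(my_log_cb, stderr);   // pass NULL, NULL to restore the default (stderr)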
2066
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
2067
+
2068
+ //
2069
+ // quantization
2070
+ //
2071
+
2072
+ // - ggml_quantize_init can be called multiple times with the same type
2073
+ // it will only initialize the quantization tables for the first call or after ggml_quantize_free
2074
+ // automatically called by ggml_quantize_chunk for convenience
2075
+ //
2076
+ // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
2077
+ // call this at the end of the program to avoid memory leaks
2078
+ //
2079
+ // note: these are thread-safe
2080
+ //
2081
+ GGML_API void ggml_quantize_init(enum ggml_type type);
2082
+ GGML_API void ggml_quantize_free(void);
2083
+
2084
+ // some quantization types cannot be used without an importance matrix
2085
+ GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2086
+
2087
+ // calls ggml_quantize_init internally (i.e. can allocate memory)
2088
+ GGML_API size_t ggml_quantize_chunk(
2089
+ enum ggml_type type,
2090
+ const float * src,
2091
+ void * dst,
2092
+ int64_t start,
2093
+ int64_t nrows,
2094
+ int64_t n_per_row,
2095
+ const float * imatrix);
2096
+
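// usage sketch (illustrative, not from ggml.h): quantizing a row-major f32 buffer. The
// destination must hold nrows * ggml_row_size(type, n_per_row) bytes, and imatrix may be
// NULL for types that do not require one (see ggml_quantize_requires_imatrix). Assumes
// <stdlib.h> for malloc.
static void * quantize_rows_q8_0(const float * src, int64_t nrows, int64_t n_per_row) {
    const enum ggml_type type = GGML_TYPE_Q8_0;
    void * dst = malloc(nrows * ggml_row_size(type, n_per_row));
    // start = 0: quantize from the beginning; ggml_quantize_init is called internally
    ggml_quantize_chunk(type, src, dst, /*start=*/0, nrows, n_per_row, /*imatrix=*/NULL);
    return dst;
}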
2097
+ //
2098
+ // gguf
2099
+ //
2100
+
2101
+ enum gguf_type {
2102
+ GGUF_TYPE_UINT8 = 0,
2103
+ GGUF_TYPE_INT8 = 1,
2104
+ GGUF_TYPE_UINT16 = 2,
2105
+ GGUF_TYPE_INT16 = 3,
2106
+ GGUF_TYPE_UINT32 = 4,
2107
+ GGUF_TYPE_INT32 = 5,
2108
+ GGUF_TYPE_FLOAT32 = 6,
2109
+ GGUF_TYPE_BOOL = 7,
2110
+ GGUF_TYPE_STRING = 8,
2111
+ GGUF_TYPE_ARRAY = 9,
2112
+ GGUF_TYPE_UINT64 = 10,
2113
+ GGUF_TYPE_INT64 = 11,
2114
+ GGUF_TYPE_FLOAT64 = 12,
2115
+ GGUF_TYPE_COUNT, // marks the end of the enum
2116
+ };
2117
+
2118
+ struct gguf_context;
2119
+
2120
+ struct gguf_init_params {
2121
+ bool no_alloc;
2122
+
2123
+ // if not NULL, create a ggml_context and allocate the tensor data in it
2124
+ struct ggml_context ** ctx;
2125
+ };
2126
+
2127
+ GGML_API struct gguf_context * gguf_init_empty(void);
2128
+ GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
2129
+ //GGML_API struct gguf_context * gguf_init_from_buffer(..);
2130
+
2131
+ GGML_API void gguf_free(struct gguf_context * ctx);
2132
+
2133
+ GGML_API const char * gguf_type_name(enum gguf_type type);
2134
+
2135
+ GGML_API int gguf_get_version (const struct gguf_context * ctx);
2136
+ GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
2137
+ GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
2138
+ GGML_API void * gguf_get_data (const struct gguf_context * ctx);
2139
+
2140
+ GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
2141
+ GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
2142
+ GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
2143
+
2144
+ GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
2145
+ GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
2146
+
2147
+ // will abort if the wrong type is used for the key
2148
+ GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
2149
+ GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
2150
+ GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
2151
+ GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
2152
+ GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
2153
+ GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
2154
+ GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
2155
+ GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
2156
+ GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
2157
+ GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
2158
+ GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
2159
+ GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
2160
+ GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
2161
+ GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
2162
+ GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
2163
+ GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
2164
+
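// usage sketch (illustrative, not from ggml.h): reading metadata from a GGUF file without
// loading tensor data. The key name below is only an example value. Assumes <stdio.h>.
static void print_gguf_arch(const char * fname) {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        return;
    }
    const int key_id = gguf_find_key(ctx, "general.architecture"); // example key name
    if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_STRING) {
        printf("architecture: %s\n", gguf_get_val_str(ctx, key_id));
    }
    gguf_free(ctx);
}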
2165
+ GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
2166
+ GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
2167
+ GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
2168
+ GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2169
+ GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
2170
+
2171
+ // removes key if it exists
2172
+ GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
2173
+
2174
+ // overwrites an existing value or adds a new one
2175
+ GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
2176
+ GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
2177
+ GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
2178
+ GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
2179
+ GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
2180
+ GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
2181
+ GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
2182
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
2183
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
2184
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
2185
+ GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
2186
+ GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
2187
+ GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
2188
+ GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
2189
+
2190
+ // set or add KV pairs from another context
2191
+ GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
2192
+
2193
+ // manage tensor info
2194
+ GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
2195
+ GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
2196
+ GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
2197
+
2198
+ // writing gguf files can be done in 2 ways:
2199
+ //
2200
+ // - write the entire gguf_context to a binary file in a single pass:
2201
+ //
2202
+ // gguf_write_to_file(ctx, fname);
2203
+ //
2204
+ // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
2205
+ //
2206
+ //   FILE * f = fopen(fname, "wb");
2207
+ //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
2208
+ //   fwrite(..., 1, ..., f); // write the tensor data
2209
+ //   void * data = malloc(gguf_get_meta_size(ctx)); gguf_get_meta_data(ctx, data);
2210
+ //   fseek(f, 0, SEEK_SET);
2211
+ //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
2212
+ //   free(data);
2213
+ //   fclose(f);
2214
+ //
2215
+
2216
+ // write the entire context to a binary file
2217
+ GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
2218
+
2219
+ // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
2220
+ GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2221
+ GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
2222
+
2223
+ #ifdef __cplusplus
2224
+ // restrict not standard in C++
2225
+ # if defined(__GNUC__)
2226
+ # define GGML_RESTRICT __restrict__
2227
+ # elif defined(__clang__)
2228
+ # define GGML_RESTRICT __restrict
2229
+ # elif defined(_MSC_VER)
2230
+ # define GGML_RESTRICT __restrict
2231
+ # else
2232
+ # define GGML_RESTRICT
2233
+ # endif
2234
+ #else
2235
+ # define GGML_RESTRICT restrict
2236
+ #endif
2237
+ typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2238
+ typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2239
+
2240
+ struct ggml_type_traits {
2241
+ const char * type_name;
2242
+ int64_t blck_size;
2243
+ int64_t blck_size_interleave; // interleave elements in blocks
2244
+ size_t type_size;
2245
+ bool is_quantized;
2246
+ ggml_to_float_t to_float;
2247
+ ggml_from_float_t from_float_ref;
2248
+ };
2249
+
2250
+ GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
2251
+
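// usage sketch (illustrative, not from ggml.h): dequantizing one row through the type traits
// table. `to_float` converts k elements of the given type into f32; `qrow` is assumed to
// point at a full row of n elements stored in that type.
static void dequantize_row(enum ggml_type type, const void * qrow, float * out, int64_t n) {
    const struct ggml_type_traits * traits = ggml_get_type_traits(type);
    if (traits->to_float) {                  // NULL for types without a conversion routine
        traits->to_float(qrow, out, n);
    }
}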
2252
+ // ggml threadpool
2253
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
2254
+ // the goal should be to create an API that other backends can use, and to move everything into the ggml base
2255
+
2256
+ // scheduling priorities
2257
+ enum ggml_sched_priority {
2258
+ GGML_SCHED_PRIO_NORMAL,
2259
+ GGML_SCHED_PRIO_MEDIUM,
2260
+ GGML_SCHED_PRIO_HIGH,
2261
+ GGML_SCHED_PRIO_REALTIME
2262
+ };
2263
+
2264
+ // threadpool params
2265
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2266
+ struct ggml_threadpool_params {
2267
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2268
+ int n_threads; // number of threads
2269
+ enum ggml_sched_priority prio; // thread priority
2270
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2271
+ bool strict_cpu; // strict cpu placement
2272
+ bool paused; // start in paused state
2273
+ };
2274
+
2275
+ struct ggml_threadpool; // forward declaration, see ggml.c
2276
+
2277
+ typedef struct ggml_threadpool * ggml_threadpool_t;
2278
+
2279
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2280
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2281
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
2282
+
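// usage sketch (illustrative, not from ggml.h): filling threadpool params. As the TODO above
// notes, creating and attaching the threadpool itself currently happens in the CPU backend
// (ggml-cpu.h); only the parameter setup lives in the base API.
static struct ggml_threadpool_params make_tp_params(int n_threads) {
    struct ggml_threadpool_params p = ggml_threadpool_params_default(n_threads);
    p.prio = GGML_SCHED_PRIO_HIGH;   // raise scheduling priority
    p.poll = 50;                     // moderate polling (0 = no polling, 100 = aggressive)
    return p;
}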
2283
+ #ifdef __cplusplus
2284
+ }
2285
+ #endif