whispercpp 1.3.0 → 1.3.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (132)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +60 -11
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -16
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/{ggml.h → ggml/include/ggml.h} +479 -596
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/{whisper.h → include/whisper.h} +23 -22
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1492 -9
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/{whisper.cpp → src/whisper.cpp} +661 -492
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -21755
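Most of this release is a refresh of the vendored whisper.cpp/ggml sources, and the ggml.h diff below carries several API-visible changes: GGML_API now implies extern, GGML_ASSERT is rebuilt on top of ggml_abort, BF16 types appear, and the RoPE entry points are reworked. As one concrete illustration of the RoPE change, ggml_rope_custom is deprecated in favour of ggml_rope_ext, which drops the separate n_ctx argument, renames n_orig_ctx to n_ctx_orig, and adds an optional frequency-factors tensor. The sketch below is illustrative only; the helper name, the surrounding variables, and the float values are assumptions, not code from this package.

    #include "ggml.h"

    // Sketch: calling RoPE through the 1.3.1 vendored ggml API.
    // cur, pos, n_rot and n_ctx_orig are assumed to exist in the caller;
    // the float arguments are common defaults, not values taken from this diff.
    static struct ggml_tensor * apply_rope_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * cur,    // activations to rotate
            struct ggml_tensor  * pos,    // int32 positions, size cur->ne[2]
            int n_rot,
            int n_ctx_orig) {
        return ggml_rope_ext(
            ctx, cur, pos,
            NULL,                  // c: optional freq factors, new in this API
            n_rot,
            GGML_ROPE_TYPE_NEOX,   // mode; replaces the old "mode & 2" convention
            n_ctx_orig,            // was n_orig_ctx; the separate n_ctx argument is gone
            10000.0f,              // freq_base (assumed default)
            1.0f,                  // freq_scale
            0.0f,                  // ext_factor
            1.0f,                  // attn_factor
            32.0f,                 // beta_fast
            1.0f);                 // beta_slow
    }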
@@ -176,25 +176,15 @@
  #ifdef GGML_SHARED
  # if defined(_WIN32) && !defined(__MINGW32__)
  # ifdef GGML_BUILD
- # define GGML_API __declspec(dllexport)
+ # define GGML_API __declspec(dllexport) extern
  # else
- # define GGML_API __declspec(dllimport)
+ # define GGML_API __declspec(dllimport) extern
  # endif
  # else
- # define GGML_API __attribute__ ((visibility ("default")))
+ # define GGML_API __attribute__ ((visibility ("default"))) extern
  # endif
  #else
- # define GGML_API
- #endif
-
- #ifdef GGML_MULTIPLATFORM
- # if defined(_WIN32)
- # define GGML_CALL
- # else
- # define GGML_CALL __attribute__((__ms_abi__))
- # endif
- #else
- # define GGML_CALL
+ # define GGML_API extern
  #endif

  // TODO: support for clang
@@ -220,21 +210,24 @@
  #include <stdio.h>

  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
- #define GGML_FILE_VERSION 1
+ #define GGML_FILE_VERSION 2

  #define GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

  #define GGML_MAX_DIMS 4
  #define GGML_MAX_PARAMS 2048
- #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 10
+ #define GGML_MAX_N_THREADS 512
+ #define GGML_MAX_OP_PARAMS 64
+
  #ifndef GGML_MAX_NAME
- #define GGML_MAX_NAME 64
+ # define GGML_MAX_NAME 64
  #endif
- #define GGML_MAX_OP_PARAMS 64
+
  #define GGML_DEFAULT_N_THREADS 4
  #define GGML_DEFAULT_GRAPH_SIZE 2048
+
  #if UINTPTR_MAX == 0xFFFFFFFF
  #define GGML_MEM_ALIGN 4
  #else
@@ -244,6 +237,10 @@
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1

+ #define GGML_ROPE_TYPE_NEOX 2
+ #define GGML_ROPE_TYPE_MROPE 8
+ #define GGML_ROPE_TYPE_VISION 24
+
  #define GGUF_MAGIC "GGUF"

  #define GGUF_VERSION 3
@@ -254,26 +251,27 @@

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

- #define GGML_ASSERT(x) \
- do { \
- if (!(x)) { \
- fflush(stdout); \
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- ggml_print_backtrace(); \
- abort(); \
- } \
- } while (0)
-
  #ifndef NDEBUG
- #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+ # define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
  #elif defined(__GNUC__)
- #define GGML_UNREACHABLE() __builtin_unreachable()
+ # define GGML_UNREACHABLE() __builtin_unreachable()
  #elif defined(_MSC_VER)
- #define GGML_UNREACHABLE() __assume(0)
+ # define GGML_UNREACHABLE() __assume(0)
  #else
- #define GGML_UNREACHABLE() ((void) 0)
+ # define GGML_UNREACHABLE() ((void) 0)
  #endif

+ #ifdef __cplusplus
+ # define GGML_NORETURN [[noreturn]]
+ #elif defined(_MSC_VER)
+ # define GGML_NORETURN __declspec(noreturn)
+ #else
+ # define GGML_NORETURN _Noreturn
+ #endif
+
+ #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+ #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
  // used to copy the number of elements and stride in bytes of tensors into local variables.
  // main purpose is to reduce code duplication and improve readability.
  //
@@ -312,10 +310,19 @@
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
  GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

+ #define GGML_TENSOR_BINARY_OP_LOCALS01 \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
+
  #ifdef __cplusplus
  extern "C" {
  #endif

+ GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+ GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
  enum ggml_status {
  GGML_STATUS_ALLOC_FAILED = -2,
  GGML_STATUS_FAILED = -1,
@@ -324,19 +331,27 @@ extern "C" {
  };

  // get ggml_status name string
- GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+ GGML_API const char * ggml_status_to_string(enum ggml_status status);

+ // ieee 754-2008 half-precision float16
+ // todo: make this not an integral type
  typedef uint16_t ggml_fp16_t;
-
- // convert FP16 <-> FP32
- GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
- GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+ // google brain half-precision bfloat16
+ typedef struct { uint16_t bits; } ggml_bf16_t;
+ GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+ GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+ GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

  struct ggml_object;
  struct ggml_context;
+ struct ggml_cgraph;

  // NOTE: always add types at the end of the enum to keep backward compatibility
  enum ggml_type {
@@ -370,7 +385,16 @@ extern "C" {
  GGML_TYPE_I64 = 27,
  GGML_TYPE_F64 = 28,
  GGML_TYPE_IQ1_M = 29,
- GGML_TYPE_COUNT,
+ GGML_TYPE_BF16 = 30,
+ // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+ // GGML_TYPE_Q4_0_4_8 = 32,
+ // GGML_TYPE_Q4_0_8_8 = 33,
+ GGML_TYPE_TQ1_0 = 34,
+ GGML_TYPE_TQ2_0 = 35,
+ // GGML_TYPE_IQ4_NL_4_4 = 36,
+ // GGML_TYPE_IQ4_NL_4_8 = 37,
+ // GGML_TYPE_IQ4_NL_8_8 = 38,
+ GGML_TYPE_COUNT = 39,
  };

  // precision
@@ -410,6 +434,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+ GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
  };

  // available tensor operations:
@@ -426,10 +451,13 @@ extern "C" {
  GGML_OP_SQR,
  GGML_OP_SQRT,
  GGML_OP_LOG,
+ GGML_OP_SIN,
+ GGML_OP_COS,
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_ARGMAX,
+ GGML_OP_COUNT_EQUAL,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_CONCAT,
@@ -460,22 +488,23 @@ extern "C" {
  GGML_OP_SOFT_MAX_BACK,
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
- GGML_OP_ALIBI,
  GGML_OP_CLAMP,
  GGML_OP_CONV_TRANSPOSE_1D,
  GGML_OP_IM2COL,
+ GGML_OP_IM2COL_BACK,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,
+ GGML_OP_POOL_2D_BACK,
  GGML_OP_UPSCALE, // nearest interpolate
  GGML_OP_PAD,
+ GGML_OP_PAD_REFLECT_1D,
  GGML_OP_ARANGE,
  GGML_OP_TIMESTEP_EMBEDDING,
  GGML_OP_ARGSORT,
  GGML_OP_LEAKY_RELU,

- GGML_OP_FLASH_ATTN,
- GGML_OP_FLASH_FF,
+ GGML_OP_FLASH_ATTN_EXT,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_SSM_CONV,
  GGML_OP_SSM_SCAN,
@@ -483,6 +512,7 @@ extern "C" {
  GGML_OP_WIN_UNPART,
  GGML_OP_GET_REL_POS,
  GGML_OP_ADD_REL_POS,
+ GGML_OP_RWKV_WKV6,

  GGML_OP_UNARY,

@@ -499,6 +529,7 @@ extern "C" {

  GGML_OP_CROSS_ENTROPY_LOSS,
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+ GGML_OP_OPT_STEP_ADAMW,

  GGML_OP_COUNT,
  };
@@ -511,11 +542,13 @@ extern "C" {
  GGML_UNARY_OP_TANH,
  GGML_UNARY_OP_ELU,
  GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_SIGMOID,
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
  GGML_UNARY_OP_HARDSWISH,
  GGML_UNARY_OP_HARDSIGMOID,
+ GGML_UNARY_OP_EXP,

  GGML_UNARY_OP_COUNT,
  };
@@ -527,36 +560,34 @@ extern "C" {
  };

  enum ggml_log_level {
- GGML_LOG_LEVEL_ERROR = 2,
+ GGML_LOG_LEVEL_NONE = 0,
+ GGML_LOG_LEVEL_DEBUG = 1,
+ GGML_LOG_LEVEL_INFO = 2,
  GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
- GGML_LOG_LEVEL_DEBUG = 5
+ GGML_LOG_LEVEL_ERROR = 4,
+ GGML_LOG_LEVEL_CONT = 5, // continue previous log
  };

+ // this tensor...
  enum ggml_tensor_flag {
- GGML_TENSOR_FLAG_INPUT = 1,
- GGML_TENSOR_FLAG_OUTPUT = 2,
- GGML_TENSOR_FLAG_PARAM = 4,
+ GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
+ GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+ GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
+ GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
  };

- // ggml object
- struct ggml_object {
- size_t offs;
- size_t size;
-
- struct ggml_object * next;
-
- enum ggml_object_type type;
-
- char padding[4];
+ struct ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
  };

- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend_type backend;
+ enum ggml_type type;
+
+ GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

  struct ggml_backend_buffer * buffer;

@@ -574,14 +605,9 @@ extern "C" {

  int32_t flags;

- struct ggml_tensor * grad;
  struct ggml_tensor * src[GGML_MAX_SRC];

- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
-
+ // source tensor and offset for views
  struct ggml_tensor * view_src;
  size_t view_offs;

@@ -601,95 +627,6 @@ extern "C" {
  // If it returns true, the computation is aborted
  typedef bool (*ggml_abort_callback)(void * data);

- // the compute plan that needs to be prepared for ggml_graph_compute()
- // since https://github.com/ggerganov/ggml/issues/287
- struct ggml_cplan {
- size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
- uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
- int n_threads;
-
- // abort ggml_graph_compute when true
- ggml_abort_callback abort_callback;
- void * abort_callback_data;
- };
-
- enum ggml_cgraph_eval_order {
- GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
- GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
- GGML_CGRAPH_EVAL_ORDER_COUNT
- };
-
- struct ggml_hash_set {
- size_t size;
- struct ggml_tensor ** keys;
- };
-
- // computation graph
- struct ggml_cgraph {
- int size;
- int n_nodes;
- int n_leafs;
-
- struct ggml_tensor ** nodes;
- struct ggml_tensor ** grads;
- struct ggml_tensor ** leafs;
-
- struct ggml_hash_set visited_hash_table;
-
- enum ggml_cgraph_eval_order order;
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
- };
-
- // scratch buffer
- struct ggml_scratch {
- size_t offs;
- size_t size;
- void * data;
- };
-
- struct ggml_init_params {
- // memory pool
- size_t mem_size; // bytes
- void * mem_buffer; // if NULL, memory will be allocated internally
- bool no_alloc; // don't allocate memory for the tensor data
- };
-
-
- // compute types
-
- // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
- // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
- enum ggml_task_type {
- GGML_TASK_TYPE_INIT = 0,
- GGML_TASK_TYPE_COMPUTE,
- GGML_TASK_TYPE_FINALIZE,
- };
-
- struct ggml_compute_params {
- enum ggml_task_type type;
-
- // ith = thread index, nth = number of threads
- int ith, nth;
-
- // work buffer for all threads
- size_t wsize;
- void * wdata;
- };
-
- // numa strategies
- enum ggml_numa_strategy {
- GGML_NUMA_STRATEGY_DISABLED = 0,
- GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
- GGML_NUMA_STRATEGY_ISOLATE = 2,
- GGML_NUMA_STRATEGY_NUMACTL = 3,
- GGML_NUMA_STRATEGY_MIRROR = 4,
- GGML_NUMA_STRATEGY_COUNT
- };

  //
  // GUID
@@ -709,67 +646,71 @@ extern "C" {
  GGML_API int64_t ggml_cycles(void);
  GGML_API int64_t ggml_cycles_per_ms(void);

- GGML_API void ggml_print_backtrace(void);
-
  // accepts a UTF-8 path, even on Windows
  GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

- GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
- GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
  GGML_API void ggml_print_object (const struct ggml_object * obj);
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

- GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ GGML_API int64_t ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

  GGML_DEPRECATED(
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
  "use ggml_row_size() instead");

- GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
- GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
+ GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
- GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+ GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

- GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
+ GGML_API bool ggml_is_quantized(enum ggml_type type);

  // TODO: temporary until model loading of ggml examples is refactored
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

- GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+
+ GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+ GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+ GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

- GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
+ GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

  // use this to compute the memory overhead of a tensor
  GGML_API size_t ggml_tensor_overhead(void);

+ GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
  // main

- GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
- GGML_API void ggml_free(struct ggml_context * ctx);
+ GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+ GGML_API void ggml_reset(struct ggml_context * ctx);
+ GGML_API void ggml_free (struct ggml_context * ctx);

  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

- GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
  GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

@@ -809,8 +750,7 @@ extern "C" {
  int64_t ne2,
  int64_t ne3);

- GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
- GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+ GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);

  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@@ -820,35 +760,25 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

- GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
- GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
  // Converts a flat index into coordinates
- GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);

- GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
- GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
- GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
- GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
- GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
- GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
  GGML_ATTRIBUTE_FORMAT(2, 3)
  GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);

+ // Tensor flags
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+
  //
  // operations on tensors with backpropagation
  //
@@ -963,6 +893,22 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sin(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sin_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_cos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_cos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return scalar
  GGML_API struct ggml_tensor * ggml_sum(
  struct ggml_context * ctx,
@@ -983,6 +929,12 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // count number of equal elements in a and b
+ GGML_API struct ggml_tensor * ggml_count_equal(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
@@ -996,12 +948,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

- // concat a and b on dim 2
+ // concat a and b along dim
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_concat(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int dim);

  GGML_API struct ggml_tensor * ggml_abs(
  struct ggml_context * ctx,
@@ -1063,6 +1016,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sigmoid(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
@@ -1104,6 +1065,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_exp(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_exp_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // normalize along rows
  GGML_API struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
@@ -1127,16 +1096,17 @@ extern "C" {

  // group normalize along ne0*ne1*n_groups
  // used in stable-diffusion
- // TODO: eps is hardcoded to 1e-6 for now
  GGML_API struct ggml_tensor * ggml_group_norm(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  GGML_API struct ggml_tensor * ggml_group_norm_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  // a - x
  // b - dy
@@ -1161,13 +1131,11 @@ extern "C" {
  enum ggml_prec prec);

  // indirect matrix multiplication
- // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);

  // A: m columns, n rows,
  // B: p columns, n rows,
@@ -1200,7 +1168,7 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1210,19 +1178,19 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  GGML_API struct ggml_tensor * ggml_set_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  GGML_API struct ggml_tensor * ggml_set_1d_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return modified a
  GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1230,7 +1198,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1238,7 +1206,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
@@ -1373,14 +1341,14 @@ extern "C" {
  // supports 3D: a->ne[2] == b->ne[1]
  GGML_API struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * a, // data
+ struct ggml_tensor * b); // row indices

  GGML_API struct ggml_tensor * ggml_get_rows_back(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- struct ggml_tensor * c);
+ struct ggml_tensor * a, // gradients of ggml_get_rows result
+ struct ggml_tensor * b, // row indices
+ struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape

  GGML_API struct ggml_tensor * ggml_diag(
  struct ggml_context * ctx,
@@ -1419,15 +1387,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+ // fused soft_max(a*scale + mask*(ALiBi slope))
  // mask is optional
- // pos is required when max_bias > 0.0f
  // max_bias = 0.0f for no ALiBi
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * mask,
- struct ggml_tensor * pos,
  float scale,
  float max_bias);

@@ -1443,9 +1409,8 @@ extern "C" {
  struct ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
- // if mode & 2 == 1, GPT-NeoX style
- // if mode & 4 == 1, ChatGLM style
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+ // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
  GGML_API struct ggml_tensor * ggml_rope(
@@ -1453,8 +1418,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx);
+ int mode);

  // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1462,18 +1426,18 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx);
+ int mode);

  // custom RoPE
- GGML_API struct ggml_tensor * ggml_rope_custom(
+ // c is freq factors (e.g. phi3-128k), (optional)
+ GGML_API struct ggml_tensor * ggml_rope_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -1481,15 +1445,15 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ GGML_API struct ggml_tensor * ggml_rope_multi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
+ int sections[4],
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -1497,47 +1461,72 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
-
- // xPos RoPE, in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
- float base,
- bool down);
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);

- // rotary position embedding backward, i.e compute dx from dy
- // a - dy
- GGML_API struct ggml_tensor * ggml_rope_back(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
- float beta_slow,
- float xpos_base,
- bool xpos_down);
+ float beta_slow),
+ "use ggml_rope_ext instead");

- // alibi position embedding
- // in-place, returns view(a)
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_past,
- int n_head,
- float bias_max),
- "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext_inplace instead");
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_API void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+
+ // rotary position embedding backward, i.e compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // gradients of ggml_rope result
+ struct ggml_tensor * b, // positions
+ struct ggml_tensor * c, // freq factors
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);

  // clamp
  // in-place, returns view(a)
@@ -1547,34 +1536,49 @@ extern "C" {
  float min,
  float max);

+ // im2col
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
  GGML_API struct ggml_tensor * ggml_im2col(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1,
- bool is_2D,
- enum ggml_type dst_type);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1, // dilation dimension 1
+ bool is_2D,
+ enum ggml_type dst_type);
+
+ GGML_API struct ggml_tensor * ggml_im2col_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // gradient of im2col output
+ int64_t * ne, // shape of im2col input
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1, // dilation dimension 1
+ bool is_2D);

  GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1); // dilation dimension 1

  GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
  int s0, // stride
  int p0, // padding
  int d0); // dilation
@@ -1583,29 +1587,29 @@ extern "C" {
  // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
  GGML_API struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s,
- int d);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s, // stride
+ int d); // dilation

  GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int p0,
- int d0);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation

  GGML_API struct ggml_tensor * ggml_conv_2d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1); // dilation dimension 1


  // kernel size is a->ne[0] x a->ne[1]
@@ -1667,13 +1671,37 @@ extern "C" {
  float p0,
  float p1);

+ GGML_API struct ggml_tensor * ggml_pool_2d_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * af, // "a"/input used in forward pass
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ float p0,
+ float p1);
+
  // nearest interpolate
+ // multiplies ne0 and ne1 by scale factor
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_upscale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int scale_factor);

+ // nearest interpolate
+ // nearest interpolate to specified dimensions
+ // used in tortoise.cpp
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3);
+
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
  GGML_API struct ggml_tensor * ggml_pad(
  struct ggml_context * ctx,
@@ -1683,6 +1711,13 @@ extern "C" {
  int p2,
  int p3);

+ // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+ GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1);
+
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
  // timesteps: [N,]
  // return: [N, dim]
@@ -1715,13 +1750,31 @@ extern "C" {
  struct ggml_tensor * a,
  int k);

- GGML_API struct ggml_tensor * ggml_flash_attn(
+ #define GGML_KQ_MASK_PAD 32
+
+ // q: [n_embd, n_batch, n_head, 1]
+ // k: [n_embd, n_kv, n_head_kv, 1]
+ // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+ // res: [n_embd, n_head, n_batch, 1] !! permuted !!
+ GGML_API struct ggml_tensor * ggml_flash_attn_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
  struct ggml_tensor * k,
  struct ggml_tensor * v,
- bool masked);
+ struct ggml_tensor * mask,
+ float scale,
+ float max_bias,
+ float logit_softcap);
+
+ GGML_API void ggml_flash_attn_ext_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec);

+ GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+ const struct ggml_tensor * a);
+
+ // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
@@ -1730,20 +1783,10 @@ extern "C" {
  struct ggml_tensor * d,
  bool masked);

- GGML_API struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
  GGML_API struct ggml_tensor * ggml_ssm_conv(
  struct ggml_context * ctx,
- struct ggml_tensor * s,
- struct ggml_tensor * x,
- struct ggml_tensor * c,
- struct ggml_tensor * sq);
+ struct ggml_tensor * sx,
+ struct ggml_tensor * c);

  GGML_API struct ggml_tensor * ggml_ssm_scan(
  struct ggml_context * ctx,
@@ -1752,8 +1795,7 @@ extern "C" {
  struct ggml_tensor * dt,
  struct ggml_tensor * A,
  struct ggml_tensor * B,
- struct ggml_tensor * C,
- struct ggml_tensor * sq);
+ struct ggml_tensor * C);

  // partition into non-overlapping windows with padding if needed
  // example:
@@ -1805,6 +1847,15 @@ extern "C" {
  struct ggml_tensor * pw,
  struct ggml_tensor * ph);

+ GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
+ struct ggml_context * ctx,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * r,
+ struct ggml_tensor * tf,
+ struct ggml_tensor * td,
+ struct ggml_tensor * state);
+
  // custom operators

  typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1888,7 +1939,8 @@ extern "C" {
  typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
  typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);

- #define GGML_N_TASKS_MAX -1
+ #define GGML_N_TASKS_MAX (-1)
+ // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks

  GGML_API struct ggml_tensor * ggml_map_custom1(
  struct ggml_context * ctx,
@@ -1941,49 +1993,59 @@ extern "C" {
1941
1993
  // loss function
1942
1994
 
1943
1995
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1944
- struct ggml_context * ctx,
1945
- struct ggml_tensor * a,
1946
- struct ggml_tensor * b);
1996
+ struct ggml_context * ctx,
1997
+ struct ggml_tensor * a, // logits
1998
+ struct ggml_tensor * b); // labels
1947
1999
 
1948
2000
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1949
- struct ggml_context * ctx,
1950
- struct ggml_tensor * a,
1951
- struct ggml_tensor * b,
1952
- struct ggml_tensor * c);
2001
+ struct ggml_context * ctx,
2002
+ struct ggml_tensor * a, // logits
2003
+ struct ggml_tensor * b, // labels
2004
+ struct ggml_tensor * c); // gradients of cross_entropy_loss result
2005
+
2006
+ // AdamW optimizer step
2007
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2008
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2009
+ GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2010
+ struct ggml_context * ctx,
2011
+ struct ggml_tensor * a,
2012
+ struct ggml_tensor * grad,
2013
+ struct ggml_tensor * m,
2014
+ struct ggml_tensor * v,
2015
+ struct ggml_tensor * adamw_params); // parameters such a the learning rate
1953
2016
 
1954
2017
  //
1955
2018
  // automatic differentiation
1956
2019
  //
1957
2020
 
1958
- GGML_API void ggml_set_param(
1959
- struct ggml_context * ctx,
1960
- struct ggml_tensor * tensor);
2021
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2022
+ GGML_API void ggml_build_backward_expand(
2023
+ struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
2024
+ struct ggml_context * ctx_compute, // context for gradient computation
2025
+ struct ggml_cgraph * cgraph,
2026
+ bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
1961
2027
 
2028
+ // graph allocation in a context
2029
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2030
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2031
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
2032
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2033
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2034
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
1962
2035
 
1963
- GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1964
- GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
2036
+ GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2037
+ GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2038
+ GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2039
+ GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
1965
2040
 
1966
- // graph allocation in a context
1967
- GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
1968
- GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
1969
- GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
1970
- GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
1971
- GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
1972
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
1973
- GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
2041
+ GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1974
2042
 
1975
2043
  GGML_API size_t ggml_graph_overhead(void);
1976
2044
  GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
1977
2045
 
1978
- // ggml_graph_plan() has to be called before ggml_graph_compute()
1979
- // when plan.work_size > 0, caller must allocate memory for plan.work_data
1980
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
1981
- GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
1982
- // same as ggml_graph_compute() but the work data is allocated as a part of the context
1983
- // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1984
- GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
1985
-
1986
- GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
2046
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
2047
+ GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
2048
+ GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
1987
2049
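A sketch of the name and gradient lookups, with a hypothetical tensor name; gradients are only present when the graph was created with grads = true and the backward pass has been expanded:

    struct ggml_tensor * w      = ggml_graph_get_tensor(gf, "my_weight"); // NULL if no tensor has that name
    struct ggml_tensor * grad_w = w ? ggml_graph_get_grad(gf, w) : NULL;  // NULL if no gradient is tracked for w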
 
1988
2050
  GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
1989
2051
  GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@@ -1994,197 +2056,14 @@ extern "C" {
1994
2056
  // dump the graph into a file using the dot format
1995
2057
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
1996
2058
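Typical usage is a one-liner per file (filenames are illustrative; passing NULL for the second graph is assumed to be acceptable when there is no separate backward graph):

    ggml_graph_export(gf, "graph.ggml");        // binary export, reloadable with ggml_graph_import()
    ggml_graph_dump_dot(gf, NULL, "graph.dot"); // then e.g. `dot -Tpng graph.dot -o graph.png`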
 
1997
- // build gradient checkpointing backward graph gb for gf using provided checkpoints
1998
- // gb_tmp will contain original backward graph with rewritten backward process nodes,
1999
- // but without the second forward pass nodes.
2000
- GGML_API void ggml_build_backward_gradient_checkpointing(
2001
- struct ggml_context * ctx,
2002
- struct ggml_cgraph * gf,
2003
- struct ggml_cgraph * gb,
2004
- struct ggml_cgraph * gb_tmp,
2005
- struct ggml_tensor * * checkpoints,
2006
- int n_checkpoints);
2007
- //
2008
- // optimization
2009
- //
2010
-
2011
- // optimization methods
2012
- enum ggml_opt_type {
2013
- GGML_OPT_TYPE_ADAM,
2014
- GGML_OPT_TYPE_LBFGS,
2015
- };
2016
-
2017
- // linesearch methods
2018
- enum ggml_linesearch {
2019
- GGML_LINESEARCH_DEFAULT = 1,
2020
-
2021
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
2022
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
2023
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
2024
- };
2025
-
2026
- // optimization return values
2027
- enum ggml_opt_result {
2028
- GGML_OPT_RESULT_OK = 0,
2029
- GGML_OPT_RESULT_DID_NOT_CONVERGE,
2030
- GGML_OPT_RESULT_NO_CONTEXT,
2031
- GGML_OPT_RESULT_INVALID_WOLFE,
2032
- GGML_OPT_RESULT_FAIL,
2033
- GGML_OPT_RESULT_CANCEL,
2034
-
2035
- GGML_LINESEARCH_FAIL = -128,
2036
- GGML_LINESEARCH_MINIMUM_STEP,
2037
- GGML_LINESEARCH_MAXIMUM_STEP,
2038
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
2039
- GGML_LINESEARCH_INVALID_PARAMETERS,
2040
- };
2041
-
2042
- typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
2059
+ // TODO: these functions were sandwiched in the old optimization interface; is there a better place for them?
2043
2060
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
2044
2061
 
2045
- // optimization parameters
2046
- //
2047
- // see ggml.c (ggml_opt_default_params) for default values
2048
- //
2049
- struct ggml_opt_params {
2050
- enum ggml_opt_type type;
2051
-
2052
- size_t graph_size;
2053
-
2054
- int n_threads;
2055
-
2056
- // delta-based convergence test
2057
- //
2058
- // if past == 0 - disabled
2059
- // if past > 0:
2060
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
2061
- //
2062
- int past;
2063
- float delta;
2064
-
2065
- // maximum number of iterations without improvement
2066
- //
2067
- // if 0 - disabled
2068
- // if > 0:
2069
- // assume convergence if no cost improvement in this number of iterations
2070
- //
2071
- int max_no_improvement;
2072
-
2073
- bool print_forward_graph;
2074
- bool print_backward_graph;
2075
-
2076
- int n_gradient_accumulation;
2077
-
2078
- // ADAM parameters
2079
- struct {
2080
- int n_iter;
2081
-
2082
- float sched; // schedule multiplier (fixed, decay or warmup)
2083
- float decay; // weight decay for AdamW, use 0.0f to disable
2084
- int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
2085
- float alpha; // learning rate
2086
- float beta1;
2087
- float beta2;
2088
- float eps; // epsilon for numerical stability
2089
- float eps_f; // epsilon for convergence test
2090
- float eps_g; // epsilon for convergence test
2091
- float gclip; // gradient clipping
2092
- } adam;
2093
-
2094
- // LBFGS parameters
2095
- struct {
2096
- int m; // number of corrections to approximate the inv. Hessian
2097
- int n_iter;
2098
- int max_linesearch;
2099
-
2100
- float eps; // convergence tolerance
2101
- float ftol; // line search tolerance
2102
- float wolfe;
2103
- float min_step;
2104
- float max_step;
2105
-
2106
- enum ggml_linesearch linesearch;
2107
- } lbfgs;
2108
- };
2109
-
2110
- struct ggml_opt_context {
2111
- struct ggml_context * ctx;
2112
- struct ggml_opt_params params;
2113
-
2114
- int iter;
2115
- int64_t nx; // number of parameter elements
2116
-
2117
- bool just_initialized;
2118
-
2119
- float loss_before;
2120
- float loss_after;
2121
-
2122
- struct {
2123
- struct ggml_tensor * g; // current gradient
2124
- struct ggml_tensor * m; // first moment
2125
- struct ggml_tensor * v; // second moment
2126
- struct ggml_tensor * pf; // past function values
2127
- float fx_best;
2128
- float fx_prev;
2129
- int n_no_improvement;
2130
- } adam;
2131
-
2132
- struct {
2133
- struct ggml_tensor * x; // current parameters
2134
- struct ggml_tensor * xp; // previous parameters
2135
- struct ggml_tensor * g; // current gradient
2136
- struct ggml_tensor * gp; // previous gradient
2137
- struct ggml_tensor * d; // search direction
2138
- struct ggml_tensor * pf; // past function values
2139
- struct ggml_tensor * lmal; // the L-BFGS memory alpha
2140
- struct ggml_tensor * lmys; // the L-BFGS memory ys
2141
- struct ggml_tensor * lms; // the L-BFGS memory s
2142
- struct ggml_tensor * lmy; // the L-BFGS memory y
2143
- float fx_best;
2144
- float step;
2145
- int j;
2146
- int k;
2147
- int end;
2148
- int n_no_improvement;
2149
- } lbfgs;
2150
- };
2151
-
2152
- GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
2153
-
2154
- // optimize the function defined by the tensor f
2155
- GGML_API enum ggml_opt_result ggml_opt(
2156
- struct ggml_context * ctx,
2157
- struct ggml_opt_params params,
2158
- struct ggml_tensor * f);
2159
-
2160
- // initialize optimizer context
2161
- GGML_API void ggml_opt_init(
2162
- struct ggml_context * ctx,
2163
- struct ggml_opt_context * opt,
2164
- struct ggml_opt_params params,
2165
- int64_t nx);
2166
-
2167
- // continue optimizing the function defined by the tensor f
2168
- GGML_API enum ggml_opt_result ggml_opt_resume(
2169
- struct ggml_context * ctx,
2170
- struct ggml_opt_context * opt,
2171
- struct ggml_tensor * f);
2172
-
2173
- // continue optimizing the function defined by the tensor f
2174
- GGML_API enum ggml_opt_result ggml_opt_resume_g(
2175
- struct ggml_context * ctx,
2176
- struct ggml_opt_context * opt,
2177
- struct ggml_tensor * f,
2178
- struct ggml_cgraph * gf,
2179
- struct ggml_cgraph * gb,
2180
- ggml_opt_callback callback,
2181
- void * callback_data);
2062
+ // Set callback for all future logging events.
2063
+ // If this is not called, or NULL is supplied, everything is output on stderr.
2064
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
2182
2065
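A minimal sketch of installing a custom logger that matches the callback type declared above (the function name is ours; assumes <stdio.h>):

    static void my_ggml_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        (void) user_data;
        fputs(text, stderr); // forward the message unchanged
    }

    // during initialization:
    ggml_log_set(my_ggml_log, NULL);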
 
2183
- //
2184
- // tensor flags
2185
- //
2186
- GGML_API void ggml_set_input(struct ggml_tensor * tensor);
2187
- GGML_API void ggml_set_output(struct ggml_tensor * tensor);
2066
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
2188
2067
 
2189
2068
  //
2190
2069
  // quantization
@@ -2289,6 +2168,9 @@ extern "C" {
2289
2168
  GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
2290
2169
  GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
2291
2170
 
2171
+ // removes key if it exists
2172
+ GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
2173
+
2292
2174
  // overrides existing values or adds a new one
2293
2175
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
2294
2176
  GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
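A sketch combining the new removal call with the existing setters (the key name is illustrative; gguf_init_empty() and gguf_free() are declared elsewhere in this header):

    struct gguf_context * gctx = gguf_init_empty();
    gguf_set_val_u8 (gctx, "example.flag", 1); // add or overwrite
    gguf_remove_key (gctx, "example.flag");    // new: drop it again (no-op if the key is absent)
    gguf_free(gctx);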
@@ -2338,64 +2220,65 @@ extern "C" {
2338
2220
  GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
2339
2221
  GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
2340
2222
 
2341
- //
2342
- // system info
2343
- //
2344
-
2345
- GGML_API int ggml_cpu_has_avx (void);
2346
- GGML_API int ggml_cpu_has_avx_vnni (void);
2347
- GGML_API int ggml_cpu_has_avx2 (void);
2348
- GGML_API int ggml_cpu_has_avx512 (void);
2349
- GGML_API int ggml_cpu_has_avx512_vbmi(void);
2350
- GGML_API int ggml_cpu_has_avx512_vnni(void);
2351
- GGML_API int ggml_cpu_has_fma (void);
2352
- GGML_API int ggml_cpu_has_neon (void);
2353
- GGML_API int ggml_cpu_has_arm_fma (void);
2354
- GGML_API int ggml_cpu_has_metal (void);
2355
- GGML_API int ggml_cpu_has_f16c (void);
2356
- GGML_API int ggml_cpu_has_fp16_va (void);
2357
- GGML_API int ggml_cpu_has_wasm_simd (void);
2358
- GGML_API int ggml_cpu_has_blas (void);
2359
- GGML_API int ggml_cpu_has_cuda (void);
2360
- GGML_API int ggml_cpu_has_clblast (void);
2361
- GGML_API int ggml_cpu_has_vulkan (void);
2362
- GGML_API int ggml_cpu_has_kompute (void);
2363
- GGML_API int ggml_cpu_has_gpublas (void);
2364
- GGML_API int ggml_cpu_has_sse3 (void);
2365
- GGML_API int ggml_cpu_has_ssse3 (void);
2366
- GGML_API int ggml_cpu_has_sycl (void);
2367
- GGML_API int ggml_cpu_has_vsx (void);
2368
- GGML_API int ggml_cpu_has_matmul_int8(void);
2369
-
2370
- //
2371
- // Internal types and functions exposed for tests and benchmarks
2372
- //
2373
-
2374
- #ifdef __cplusplus
2375
- // restrict not standard in C++
2376
- #define GGML_RESTRICT
2223
+ #ifdef __cplusplus
2224
+ // restrict not standard in C++
2225
+ # if defined(__GNUC__)
2226
+ # define GGML_RESTRICT __restrict__
2227
+ # elif defined(__clang__)
2228
+ # define GGML_RESTRICT __restrict
2229
+ # elif defined(_MSC_VER)
2230
+ # define GGML_RESTRICT __restrict
2231
+ # else
2232
+ # define GGML_RESTRICT
2233
+ # endif
2377
2234
  #else
2378
- #define GGML_RESTRICT restrict
2235
+ # define GGML_RESTRICT restrict
2379
2236
  #endif
2380
2237
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2381
2238
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2382
- typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2383
- const void * GGML_RESTRICT y, size_t by, int nrc);
2384
-
2385
- typedef struct {
2386
- const char * type_name;
2387
- int blck_size;
2388
- size_t type_size;
2389
- bool is_quantized;
2390
- ggml_to_float_t to_float;
2391
- ggml_from_float_t from_float;
2392
- ggml_from_float_t from_float_reference;
2393
- ggml_vec_dot_t vec_dot;
2394
- enum ggml_type vec_dot_type;
2395
- int64_t nrows; // number of rows to process simultaneously;
2396
- } ggml_type_traits_t;
2397
-
2398
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
2239
+
2240
+ struct ggml_type_traits {
2241
+ const char * type_name;
2242
+ int64_t blck_size;
2243
+ int64_t blck_size_interleave; // interleave elements in blocks
2244
+ size_t type_size;
2245
+ bool is_quantized;
2246
+ ggml_to_float_t to_float;
2247
+ ggml_from_float_t from_float_ref;
2248
+ };
2249
+
2250
+ GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
2251
+
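A sketch of querying the slimmed-down traits table; note that the accessor now returns a const pointer instead of a struct by value (assumes <stdio.h>):

    const struct ggml_type_traits * tt = ggml_get_type_traits(GGML_TYPE_Q4_0);
    printf("%s: %lld elems/block, %zu bytes/block, quantized=%d\n",
           tt->type_name, (long long) tt->blck_size, tt->type_size, (int) tt->is_quantized);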
2252
+ // ggml threadpool
2253
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
2254
+ // the goal should be to create an API that other backends can use, and to move everything to the ggml base
2255
+
2256
+ // scheduling priorities
2257
+ enum ggml_sched_priority {
2258
+ GGML_SCHED_PRIO_NORMAL,
2259
+ GGML_SCHED_PRIO_MEDIUM,
2260
+ GGML_SCHED_PRIO_HIGH,
2261
+ GGML_SCHED_PRIO_REALTIME
2262
+ };
2263
+
2264
+ // threadpool params
2265
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
2266
+ struct ggml_threadpool_params {
2267
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2268
+ int n_threads; // number of threads
2269
+ enum ggml_sched_priority prio; // thread priority
2270
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2271
+ bool strict_cpu; // strict cpu placement
2272
+ bool paused; // start in paused state
2273
+ };
2274
+
2275
+ struct ggml_threadpool; // forward declaration, see ggml.c
2276
+
2277
+ typedef struct ggml_threadpool * ggml_threadpool_t;
2278
+
2279
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
2280
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
2281
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
2399
2282
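A sketch of filling in the params; creating and using the threadpool itself is handled by the backends, not by this header:

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads, default affinity
    tpp.prio = GGML_SCHED_PRIO_HIGH;
    tpp.poll = 50; // moderate polling

    struct ggml_threadpool_params ref;
    ggml_threadpool_params_init(&ref, 8);
    bool same = ggml_threadpool_params_match(&tpp, &ref); // false here: prio and poll differ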
 
2400
2283
  #ifdef __cplusplus
2401
2284
  }