whispercpp 1.3.0 → 1.3.1

Files changed (132)
  1. checksums.yaml +4 -4
  2. data/.gitignore +5 -0
  3. data/LICENSE +1 -1
  4. data/README.md +165 -434
  5. data/Rakefile +60 -11
  6. data/ext/.gitignore +13 -0
  7. data/ext/cpu.mk +9 -0
  8. data/ext/{dr_wav.h → examples/dr_wav.h} +3560 -1179
  9. data/ext/extconf.rb +185 -16
  10. data/ext/ggml/include/ggml-alloc.h +76 -0
  11. data/ext/ggml/include/ggml-backend.h +352 -0
  12. data/ext/ggml/include/ggml-blas.h +25 -0
  13. data/ext/ggml/include/ggml-cann.h +123 -0
  14. data/ext/ggml/include/ggml-cpp.h +38 -0
  15. data/ext/ggml/include/ggml-cpu.h +135 -0
  16. data/ext/ggml/include/ggml-cuda.h +47 -0
  17. data/ext/ggml/include/ggml-kompute.h +50 -0
  18. data/ext/ggml/include/ggml-metal.h +66 -0
  19. data/ext/ggml/include/ggml-opencl.h +26 -0
  20. data/ext/ggml/include/ggml-opt.h +216 -0
  21. data/ext/ggml/include/ggml-rpc.h +28 -0
  22. data/ext/ggml/include/ggml-sycl.h +49 -0
  23. data/ext/ggml/include/ggml-vulkan.h +31 -0
  24. data/ext/{ggml.h → ggml/include/ggml.h} +479 -596
  25. data/ext/ggml/src/ggml-alloc.c +1037 -0
  26. data/ext/ggml/src/ggml-amx/common.h +94 -0
  27. data/ext/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  28. data/ext/ggml/src/ggml-amx/mmq.cpp +2510 -0
  29. data/ext/ggml/src/ggml-amx/mmq.h +17 -0
  30. data/ext/ggml/src/ggml-backend-impl.h +256 -0
  31. data/ext/ggml/src/ggml-backend-reg.cpp +552 -0
  32. data/ext/ggml/src/ggml-backend.cpp +1999 -0
  33. data/ext/ggml/src/ggml-blas/ggml-blas.cpp +517 -0
  34. data/ext/ggml/src/ggml-cann/acl_tensor.cpp +175 -0
  35. data/ext/ggml/src/ggml-cann/acl_tensor.h +258 -0
  36. data/ext/ggml/src/ggml-cann/aclnn_ops.cpp +3427 -0
  37. data/ext/ggml/src/ggml-cann/aclnn_ops.h +592 -0
  38. data/ext/ggml/src/ggml-cann/common.h +286 -0
  39. data/ext/ggml/src/ggml-cann/ggml-cann.cpp +2188 -0
  40. data/ext/ggml/src/ggml-cann/kernels/ascendc_kernels.h +19 -0
  41. data/ext/ggml/src/ggml-cann/kernels/dup.cpp +236 -0
  42. data/ext/ggml/src/ggml-cann/kernels/get_row_f16.cpp +197 -0
  43. data/ext/ggml/src/ggml-cann/kernels/get_row_f32.cpp +190 -0
  44. data/ext/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +204 -0
  45. data/ext/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +191 -0
  46. data/ext/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +218 -0
  47. data/ext/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +216 -0
  48. data/ext/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +295 -0
  49. data/ext/ggml/src/ggml-common.h +1853 -0
  50. data/ext/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  51. data/ext/ggml/src/ggml-cpu/amx/amx.h +8 -0
  52. data/ext/ggml/src/ggml-cpu/amx/common.h +91 -0
  53. data/ext/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  54. data/ext/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  55. data/ext/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  56. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +4262 -0
  57. data/ext/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  58. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  59. data/ext/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  60. data/ext/ggml/src/ggml-cpu/ggml-cpu-impl.h +386 -0
  61. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  62. data/ext/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  63. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  64. data/ext/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  65. data/ext/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  66. data/ext/ggml/src/ggml-cpu/ggml-cpu.cpp +622 -0
  67. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1884 -0
  68. data/ext/ggml/src/ggml-cpu/llamafile/sgemm.h +14 -0
  69. data/ext/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  70. data/ext/ggml/src/ggml-cuda/vendors/hip.h +186 -0
  71. data/ext/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  72. data/ext/ggml/src/ggml-impl.h +556 -0
  73. data/ext/ggml/src/ggml-kompute/ggml-kompute.cpp +2251 -0
  74. data/ext/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  75. data/ext/ggml/src/ggml-metal/ggml-metal.m +4884 -0
  76. data/ext/ggml/src/ggml-metal/ggml-metal.metal +6732 -0
  77. data/ext/ggml/src/ggml-opt.cpp +854 -0
  78. data/ext/ggml/src/ggml-quants.c +5238 -0
  79. data/ext/ggml/src/ggml-quants.h +100 -0
  80. data/ext/ggml/src/ggml-rpc/ggml-rpc.cpp +1406 -0
  81. data/ext/ggml/src/ggml-sycl/common.cpp +95 -0
  82. data/ext/ggml/src/ggml-sycl/concat.cpp +196 -0
  83. data/ext/ggml/src/ggml-sycl/conv.cpp +99 -0
  84. data/ext/ggml/src/ggml-sycl/convert.cpp +547 -0
  85. data/ext/ggml/src/ggml-sycl/dmmv.cpp +1023 -0
  86. data/ext/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  87. data/ext/ggml/src/ggml-sycl/ggml-sycl.cpp +4729 -0
  88. data/ext/ggml/src/ggml-sycl/im2col.cpp +126 -0
  89. data/ext/ggml/src/ggml-sycl/mmq.cpp +3031 -0
  90. data/ext/ggml/src/ggml-sycl/mmvq.cpp +1015 -0
  91. data/ext/ggml/src/ggml-sycl/norm.cpp +378 -0
  92. data/ext/ggml/src/ggml-sycl/outprod.cpp +56 -0
  93. data/ext/ggml/src/ggml-sycl/rope.cpp +276 -0
  94. data/ext/ggml/src/ggml-sycl/softmax.cpp +251 -0
  95. data/ext/ggml/src/ggml-sycl/tsembd.cpp +72 -0
  96. data/ext/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  97. data/ext/ggml/src/ggml-threading.cpp +12 -0
  98. data/ext/ggml/src/ggml-threading.h +14 -0
  99. data/ext/ggml/src/ggml-vulkan/ggml-vulkan.cpp +8657 -0
  100. data/ext/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  101. data/ext/ggml/src/ggml.c +7694 -0
  102. data/ext/{whisper.h → include/whisper.h} +23 -22
  103. data/ext/metal-embed.mk +17 -0
  104. data/ext/metal.mk +6 -0
  105. data/ext/ruby_whisper.cpp +1492 -9
  106. data/ext/ruby_whisper.h +10 -0
  107. data/ext/scripts/get-flags.mk +38 -0
  108. data/ext/src/coreml/whisper-decoder-impl.h +146 -0
  109. data/ext/src/coreml/whisper-decoder-impl.m +201 -0
  110. data/ext/src/coreml/whisper-encoder-impl.h +142 -0
  111. data/ext/src/coreml/whisper-encoder-impl.m +197 -0
  112. data/ext/src/coreml/whisper-encoder.h +26 -0
  113. data/ext/src/openvino/whisper-openvino-encoder.cpp +108 -0
  114. data/ext/src/openvino/whisper-openvino-encoder.h +31 -0
  115. data/ext/{whisper.cpp → src/whisper.cpp} +661 -492
  116. data/extsources.rb +6 -0
  117. data/lib/whisper/model/uri.rb +157 -0
  118. data/lib/whisper.rb +2 -0
  119. data/tests/helper.rb +7 -0
  120. data/tests/jfk_reader/.gitignore +5 -0
  121. data/tests/jfk_reader/extconf.rb +3 -0
  122. data/tests/jfk_reader/jfk_reader.c +68 -0
  123. data/tests/test_callback.rb +160 -0
  124. data/tests/test_error.rb +20 -0
  125. data/tests/test_model.rb +71 -0
  126. data/tests/test_package.rb +31 -0
  127. data/tests/test_params.rb +160 -0
  128. data/tests/test_segment.rb +83 -0
  129. data/tests/test_whisper.rb +211 -123
  130. data/whispercpp.gemspec +36 -0
  131. metadata +137 -11
  132. data/ext/ggml.c +0 -21755
@@ -176,25 +176,15 @@
  #ifdef GGML_SHARED
  # if defined(_WIN32) && !defined(__MINGW32__)
  # ifdef GGML_BUILD
- # define GGML_API __declspec(dllexport)
+ # define GGML_API __declspec(dllexport) extern
  # else
- # define GGML_API __declspec(dllimport)
+ # define GGML_API __declspec(dllimport) extern
  # endif
  # else
- # define GGML_API __attribute__ ((visibility ("default")))
+ # define GGML_API __attribute__ ((visibility ("default"))) extern
  # endif
  #else
- # define GGML_API
- #endif
-
- #ifdef GGML_MULTIPLATFORM
- # if defined(_WIN32)
- # define GGML_CALL
- # else
- # define GGML_CALL __attribute__((__ms_abi__))
- # endif
- #else
- # define GGML_CALL
+ # define GGML_API extern
  #endif

  // TODO: support for clang
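The net effect of this hunk: `GGML_API` now always expands with an explicit `extern`, and the unused `GGML_CALL`/`GGML_MULTIPLATFORM` machinery is gone. A sketch of the expansion (using `ggml_free`, which is declared later in this header):

```c
/* With GGML_SHARED and GGML_BUILD defined under MSVC, a declaration such as
 *     GGML_API void ggml_free(struct ggml_context * ctx);
 * now expands to
 *     __declspec(dllexport) extern void ggml_free(struct ggml_context * ctx);
 * and to a plain
 *     extern void ggml_free(struct ggml_context * ctx);
 * in static builds. */
```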
@@ -220,21 +210,24 @@
  #include <stdio.h>

  #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
- #define GGML_FILE_VERSION 1
+ #define GGML_FILE_VERSION 2

  #define GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

  #define GGML_MAX_DIMS 4
  #define GGML_MAX_PARAMS 2048
- #define GGML_MAX_CONTEXTS 64
  #define GGML_MAX_SRC 10
+ #define GGML_MAX_N_THREADS 512
+ #define GGML_MAX_OP_PARAMS 64
+
  #ifndef GGML_MAX_NAME
- #define GGML_MAX_NAME 64
+ # define GGML_MAX_NAME 64
  #endif
- #define GGML_MAX_OP_PARAMS 64
+
  #define GGML_DEFAULT_N_THREADS 4
  #define GGML_DEFAULT_GRAPH_SIZE 2048
+
  #if UINTPTR_MAX == 0xFFFFFFFF
  #define GGML_MEM_ALIGN 4
  #else
@@ -244,6 +237,10 @@
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1

+ #define GGML_ROPE_TYPE_NEOX 2
+ #define GGML_ROPE_TYPE_MROPE 8
+ #define GGML_ROPE_TYPE_VISION 24
+
  #define GGUF_MAGIC "GGUF"

  #define GGUF_VERSION 3
@@ -254,26 +251,27 @@

  #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

- #define GGML_ASSERT(x) \
- do { \
- if (!(x)) { \
- fflush(stdout); \
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- ggml_print_backtrace(); \
- abort(); \
- } \
- } while (0)
-
  #ifndef NDEBUG
- #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+ # define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
  #elif defined(__GNUC__)
- #define GGML_UNREACHABLE() __builtin_unreachable()
+ # define GGML_UNREACHABLE() __builtin_unreachable()
  #elif defined(_MSC_VER)
- #define GGML_UNREACHABLE() __assume(0)
+ # define GGML_UNREACHABLE() __assume(0)
  #else
- #define GGML_UNREACHABLE() ((void) 0)
+ # define GGML_UNREACHABLE() ((void) 0)
  #endif

+ #ifdef __cplusplus
+ # define GGML_NORETURN [[noreturn]]
+ #elif defined(_MSC_VER)
+ # define GGML_NORETURN __declspec(noreturn)
+ #else
+ # define GGML_NORETURN _Noreturn
+ #endif
+
+ #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+ #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
  // used to copy the number of elements and stride in bytes of tensors into local variables.
  // main purpose is to reduce code duplication and improve readability.
  //
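The assertion machinery is now a thin wrapper over `ggml_abort()` (declared further down) with printf-style formatting. A minimal usage sketch; the helper and its messages are illustrative, not from the library:

```c
// Illustrative helper showing the new assertion style.
static void check_tensor(const struct ggml_tensor * t) {
    GGML_ASSERT(t != NULL); // on failure: ggml_abort(..., "GGML_ASSERT(t != NULL) failed")
    if (t->type >= GGML_TYPE_COUNT) {
        GGML_ABORT("invalid tensor type %d", (int) t->type); // formatted abort message
    }
}
```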
@@ -312,10 +310,19 @@
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
  GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

+ #define GGML_TENSOR_BINARY_OP_LOCALS01 \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
+
  #ifdef __cplusplus
  extern "C" {
  #endif

+ GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+ GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
  enum ggml_status {
  GGML_STATUS_ALLOC_FAILED = -2,
  GGML_STATUS_FAILED = -1,
@@ -324,19 +331,27 @@ extern "C" {
  };

  // get ggml_status name string
- GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+ GGML_API const char * ggml_status_to_string(enum ggml_status status);

+ // ieee 754-2008 half-precision float16
+ // todo: make this not an integral type
  typedef uint16_t ggml_fp16_t;
-
- // convert FP16 <-> FP32
- GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
- GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
- GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
- GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
+
+ // google brain half-precision bfloat16
+ typedef struct { uint16_t bits; } ggml_bf16_t;
+ GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+ GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
+ GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
+ GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

  struct ggml_object;
  struct ggml_context;
+ struct ggml_cgraph;

  // NOTE: always add types at the end of the enum to keep backward compatibility
  enum ggml_type {
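Note the parallel bf16 API added next to fp16. A minimal conversion sketch, assuming only the declarations above and a `"ggml.h"` include path:

```c
#include <stdint.h>
#include "ggml.h" // assumed include path

void convert_example(const float * src, int64_t n) {
    // scalar round-trip through fp16
    ggml_fp16_t h    = ggml_fp32_to_fp16(src[0]);
    float       back = ggml_fp16_to_fp32(h);
    (void) back;

    // row conversion to bf16; destination buffer is caller-managed
    ggml_bf16_t row[64];
    if (n > 64) { n = 64; }
    ggml_fp32_to_bf16_row(src, row, n);
}
```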
@@ -370,7 +385,16 @@ extern "C" {
  GGML_TYPE_I64 = 27,
  GGML_TYPE_F64 = 28,
  GGML_TYPE_IQ1_M = 29,
- GGML_TYPE_COUNT,
+ GGML_TYPE_BF16 = 30,
+ // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+ // GGML_TYPE_Q4_0_4_8 = 32,
+ // GGML_TYPE_Q4_0_8_8 = 33,
+ GGML_TYPE_TQ1_0 = 34,
+ GGML_TYPE_TQ2_0 = 35,
+ // GGML_TYPE_IQ4_NL_4_4 = 36,
+ // GGML_TYPE_IQ4_NL_4_8 = 37,
+ // GGML_TYPE_IQ4_NL_8_8 = 38,
+ GGML_TYPE_COUNT = 39,
  };

  // precision
@@ -410,6 +434,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+ GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
  };

  // available tensor operations:
@@ -426,10 +451,13 @@ extern "C" {
  GGML_OP_SQR,
  GGML_OP_SQRT,
  GGML_OP_LOG,
+ GGML_OP_SIN,
+ GGML_OP_COS,
  GGML_OP_SUM,
  GGML_OP_SUM_ROWS,
  GGML_OP_MEAN,
  GGML_OP_ARGMAX,
+ GGML_OP_COUNT_EQUAL,
  GGML_OP_REPEAT,
  GGML_OP_REPEAT_BACK,
  GGML_OP_CONCAT,
@@ -460,22 +488,23 @@ extern "C" {
  GGML_OP_SOFT_MAX_BACK,
  GGML_OP_ROPE,
  GGML_OP_ROPE_BACK,
- GGML_OP_ALIBI,
  GGML_OP_CLAMP,
  GGML_OP_CONV_TRANSPOSE_1D,
  GGML_OP_IM2COL,
+ GGML_OP_IM2COL_BACK,
  GGML_OP_CONV_TRANSPOSE_2D,
  GGML_OP_POOL_1D,
  GGML_OP_POOL_2D,
+ GGML_OP_POOL_2D_BACK,
  GGML_OP_UPSCALE, // nearest interpolate
  GGML_OP_PAD,
+ GGML_OP_PAD_REFLECT_1D,
  GGML_OP_ARANGE,
  GGML_OP_TIMESTEP_EMBEDDING,
  GGML_OP_ARGSORT,
  GGML_OP_LEAKY_RELU,

- GGML_OP_FLASH_ATTN,
- GGML_OP_FLASH_FF,
+ GGML_OP_FLASH_ATTN_EXT,
  GGML_OP_FLASH_ATTN_BACK,
  GGML_OP_SSM_CONV,
  GGML_OP_SSM_SCAN,
@@ -483,6 +512,7 @@ extern "C" {
  GGML_OP_WIN_UNPART,
  GGML_OP_GET_REL_POS,
  GGML_OP_ADD_REL_POS,
+ GGML_OP_RWKV_WKV6,

  GGML_OP_UNARY,

@@ -499,6 +529,7 @@ extern "C" {

  GGML_OP_CROSS_ENTROPY_LOSS,
  GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+ GGML_OP_OPT_STEP_ADAMW,

  GGML_OP_COUNT,
  };
@@ -511,11 +542,13 @@ extern "C" {
  GGML_UNARY_OP_TANH,
  GGML_UNARY_OP_ELU,
  GGML_UNARY_OP_RELU,
+ GGML_UNARY_OP_SIGMOID,
  GGML_UNARY_OP_GELU,
  GGML_UNARY_OP_GELU_QUICK,
  GGML_UNARY_OP_SILU,
  GGML_UNARY_OP_HARDSWISH,
  GGML_UNARY_OP_HARDSIGMOID,
+ GGML_UNARY_OP_EXP,

  GGML_UNARY_OP_COUNT,
  };
@@ -527,36 +560,34 @@ extern "C" {
  };

  enum ggml_log_level {
- GGML_LOG_LEVEL_ERROR = 2,
+ GGML_LOG_LEVEL_NONE = 0,
+ GGML_LOG_LEVEL_DEBUG = 1,
+ GGML_LOG_LEVEL_INFO = 2,
  GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
- GGML_LOG_LEVEL_DEBUG = 5
+ GGML_LOG_LEVEL_ERROR = 4,
+ GGML_LOG_LEVEL_CONT = 5, // continue previous log
  };

+ // this tensor...
  enum ggml_tensor_flag {
- GGML_TENSOR_FLAG_INPUT = 1,
- GGML_TENSOR_FLAG_OUTPUT = 2,
- GGML_TENSOR_FLAG_PARAM = 4,
+ GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
+ GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+ GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
+ GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
  };

- // ggml object
- struct ggml_object {
- size_t offs;
- size_t size;
-
- struct ggml_object * next;
-
- enum ggml_object_type type;
-
- char padding[4];
+ struct ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
  };

- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
  // n-dimensional tensor
  struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend_type backend;
+ enum ggml_type type;
+
+ GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

  struct ggml_backend_buffer * buffer;

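For orientation, a minimal sketch of the context lifecycle around the relocated `ggml_init_params` struct (the pool size is arbitrary):

```c
#include "ggml.h" // assumed include path

void context_lifecycle_example(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024, // bytes; 16 MB is arbitrary
        /*.mem_buffer =*/ NULL,             // NULL => ggml allocates the pool
        /*.no_alloc   =*/ false,            // allocate tensor data in the pool
    };
    struct ggml_context * ctx = ggml_init(params);
    // ... create tensors and graphs ...
    ggml_free(ctx);
}
```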
@@ -574,14 +605,9 @@ extern "C" {

  int32_t flags;

- struct ggml_tensor * grad;
  struct ggml_tensor * src[GGML_MAX_SRC];

- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
-
+ // source tensor and offset for views
  struct ggml_tensor * view_src;
  size_t view_offs;

@@ -601,95 +627,6 @@ extern "C" {
  // If it returns true, the computation is aborted
  typedef bool (*ggml_abort_callback)(void * data);

- // the compute plan that needs to be prepared for ggml_graph_compute()
- // since https://github.com/ggerganov/ggml/issues/287
- struct ggml_cplan {
- size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
- uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
- int n_threads;
-
- // abort ggml_graph_compute when true
- ggml_abort_callback abort_callback;
- void * abort_callback_data;
- };
-
- enum ggml_cgraph_eval_order {
- GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
- GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
- GGML_CGRAPH_EVAL_ORDER_COUNT
- };
-
- struct ggml_hash_set {
- size_t size;
- struct ggml_tensor ** keys;
- };
-
- // computation graph
- struct ggml_cgraph {
- int size;
- int n_nodes;
- int n_leafs;
-
- struct ggml_tensor ** nodes;
- struct ggml_tensor ** grads;
- struct ggml_tensor ** leafs;
-
- struct ggml_hash_set visited_hash_table;
-
- enum ggml_cgraph_eval_order order;
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
- };
-
- // scratch buffer
- struct ggml_scratch {
- size_t offs;
- size_t size;
- void * data;
- };
-
- struct ggml_init_params {
- // memory pool
- size_t mem_size; // bytes
- void * mem_buffer; // if NULL, memory will be allocated internally
- bool no_alloc; // don't allocate memory for the tensor data
- };
-
-
- // compute types
-
- // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
- // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
- enum ggml_task_type {
- GGML_TASK_TYPE_INIT = 0,
- GGML_TASK_TYPE_COMPUTE,
- GGML_TASK_TYPE_FINALIZE,
- };
-
- struct ggml_compute_params {
- enum ggml_task_type type;
-
- // ith = thread index, nth = number of threads
- int ith, nth;
-
- // work buffer for all threads
- size_t wsize;
- void * wdata;
- };
-
- // numa strategies
- enum ggml_numa_strategy {
- GGML_NUMA_STRATEGY_DISABLED = 0,
- GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
- GGML_NUMA_STRATEGY_ISOLATE = 2,
- GGML_NUMA_STRATEGY_NUMACTL = 3,
- GGML_NUMA_STRATEGY_MIRROR = 4,
- GGML_NUMA_STRATEGY_COUNT
- };

  //
  // GUID
@@ -709,67 +646,71 @@ extern "C" {
  GGML_API int64_t ggml_cycles(void);
  GGML_API int64_t ggml_cycles_per_ms(void);

- GGML_API void ggml_print_backtrace(void);
-
  // accepts a UTF-8 path, even on Windows
  GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

- GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
- GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
  GGML_API void ggml_print_object (const struct ggml_object * obj);
  GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
- GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+ GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+ GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

- GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ GGML_API int64_t ggml_blck_size(enum ggml_type type);
+ GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

  GGML_DEPRECATED(
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
  "use ggml_row_size() instead");

- GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
- GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
- GGML_API const char * ggml_op_symbol(enum ggml_op op);
+ GGML_API const char * ggml_type_name(enum ggml_type type);
+ GGML_API const char * ggml_op_name (enum ggml_op op);
+ GGML_API const char * ggml_op_symbol(enum ggml_op op);

- GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
- GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+ GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+ GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

- GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
+ GGML_API bool ggml_is_quantized(enum ggml_type type);

  // TODO: temporary until model loading of ggml examples is refactored
  GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

- GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
- GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
- GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
- GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+ GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
+ GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
+
+ GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+ GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+ GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+ GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2

- GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+ GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
+ GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

  // use this to compute the memory overhead of a tensor
  GGML_API size_t ggml_tensor_overhead(void);

+ GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
  // main

- GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
- GGML_API void ggml_free(struct ggml_context * ctx);
+ GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+ GGML_API void ggml_reset(struct ggml_context * ctx);
+ GGML_API void ggml_free (struct ggml_context * ctx);

  GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

- GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
  GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

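The `GGML_CALL` qualifier is dropped from these helpers but call sites are unchanged. A small introspection sketch, assuming the usual `ggml_tensor` layout with dimensions in `ne[]`:

```c
#include <stdio.h>
#include "ggml.h" // assumed include path

void print_tensor_info(const struct ggml_tensor * t) {
    printf("%s: %lld elements, %zu bytes, row size %zu, contiguous: %d\n",
           ggml_get_name(t),
           (long long) ggml_nelements(t),
           ggml_nbytes(t),
           ggml_row_size(t->type, t->ne[0]),
           ggml_is_contiguous(t) ? 1 : 0);
}
```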
@@ -809,8 +750,7 @@ extern "C" {
  int64_t ne2,
  int64_t ne3);

- GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
- GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+ GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);

  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
  GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@@ -820,35 +760,25 @@ extern "C" {
  GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

- GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
- GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
- GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
  // Converts a flat index into coordinates
- GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+ GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);

- GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
- GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
- GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
- GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
- GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
- GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
  GGML_ATTRIBUTE_FORMAT(2, 3)
  GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);

+ // Tensor flags
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+ GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+
  //
  // operations on tensors with backpropagation
  //
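The new per-tensor flag setters correspond to the `ggml_tensor_flag` values defined earlier. A brief sketch, assuming the standard `ggml_new_tensor_1d` constructor (not part of this diff):

```c
// Assumes a valid ctx; sizes are illustrative.
void mark_graph_tensors(struct ggml_context * ctx) {
    struct ggml_tensor * input  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    struct ggml_tensor * weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

    ggml_set_input(input);       // GGML_TENSOR_FLAG_INPUT
    ggml_set_param(ctx, weight); // GGML_TENSOR_FLAG_PARAM (trainable)
}
```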
@@ -963,6 +893,22 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sin(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sin_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_cos(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_cos_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // return scalar
  GGML_API struct ggml_tensor * ggml_sum(
  struct ggml_context * ctx,
@@ -983,6 +929,12 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // count number of equal elements in a and b
+ GGML_API struct ggml_tensor * ggml_count_equal(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
@@ -996,12 +948,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

- // concat a and b on dim 2
+ // concat a and b along dim
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_concat(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ int dim);

  GGML_API struct ggml_tensor * ggml_abs(
  struct ggml_context * ctx,
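`ggml_concat` now takes an explicit `dim`; the old signature concatenated on dim 2 unconditionally. A hedged migration sketch:

```c
// Migration sketch; a and b are tensors built earlier in ctx.
struct ggml_tensor * concat_like_130(struct ggml_context * ctx,
                                     struct ggml_tensor * a,
                                     struct ggml_tensor * b) {
    // 1.3.0-era call: ggml_concat(ctx, a, b)
    return ggml_concat(ctx, a, b, /*dim=*/2); // dim 2 preserves the old behavior
}
```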
@@ -1063,6 +1016,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_sigmoid(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  GGML_API struct ggml_tensor * ggml_gelu(
  struct ggml_context * ctx,
  struct ggml_tensor * a);
@@ -1104,6 +1065,14 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ GGML_API struct ggml_tensor * ggml_exp(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_exp_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
  // normalize along rows
  GGML_API struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
@@ -1127,16 +1096,17 @@ extern "C" {

  // group normalize along ne0*ne1*n_groups
  // used in stable-diffusion
- // TODO: eps is hardcoded to 1e-6 for now
  GGML_API struct ggml_tensor * ggml_group_norm(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  GGML_API struct ggml_tensor * ggml_group_norm_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  // a - x
  // b - dy
@@ -1161,13 +1131,11 @@ extern "C" {
  enum ggml_prec prec);

  // indirect matrix multiplication
- // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
  GGML_API struct ggml_tensor * ggml_mul_mat_id(
  struct ggml_context * ctx,
  struct ggml_tensor * as,
- struct ggml_tensor * ids,
- int id,
- struct ggml_tensor * b);
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);

  // A: m columns, n rows,
  // B: p columns, n rows,
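`ggml_mul_mat_id` loses the scalar `id` and swaps the order of `b` and `ids`; expert selection now comes entirely from the `ids` tensor. A hedged migration sketch (shapes omitted; see upstream docs):

```c
// ids: int32 tensor of selected expert indices (illustrative)
struct ggml_tensor * moe_matmul(struct ggml_context * ctx,
                                struct ggml_tensor * as,
                                struct ggml_tensor * b,
                                struct ggml_tensor * ids) {
    // 1.3.0-era call: ggml_mul_mat_id(ctx, as, ids, id, b)
    return ggml_mul_mat_id(ctx, as, b, ids);
}
```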
@@ -1200,7 +1168,7 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  GGML_API struct ggml_tensor * ggml_set_inplace(
@@ -1210,19 +1178,19 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  GGML_API struct ggml_tensor * ggml_set_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  GGML_API struct ggml_tensor * ggml_set_1d_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return modified a
  GGML_API struct ggml_tensor * ggml_set_2d(
@@ -1230,7 +1198,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@@ -1238,7 +1206,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // a -> b, return view(b)
  GGML_API struct ggml_tensor * ggml_cpy(
@@ -1373,14 +1341,14 @@ extern "C" {
  // supports 3D: a->ne[2] == b->ne[1]
  GGML_API struct ggml_tensor * ggml_get_rows(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ struct ggml_tensor * a, // data
+ struct ggml_tensor * b); // row indices

  GGML_API struct ggml_tensor * ggml_get_rows_back(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- struct ggml_tensor * c);
+ struct ggml_tensor * a, // gradients of ggml_get_rows result
+ struct ggml_tensor * b, // row indices
+ struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape

  GGML_API struct ggml_tensor * ggml_diag(
  struct ggml_context * ctx,
@@ -1419,15 +1387,13 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

- // fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
+ // fused soft_max(a*scale + mask*(ALiBi slope))
  // mask is optional
- // pos is required when max_bias > 0.0f
  // max_bias = 0.0f for no ALiBi
  GGML_API struct ggml_tensor * ggml_soft_max_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * mask,
- struct ggml_tensor * pos,
  float scale,
  float max_bias);

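`ggml_soft_max_ext` drops the separate `pos` tensor; ALiBi slopes are now derived from `mask` and `max_bias`. A hedged migration sketch with illustrative names:

```c
// kq and kq_mask are illustrative; pass max_bias = 0.0f to disable ALiBi.
struct ggml_tensor * attn_probs(struct ggml_context * ctx,
                                struct ggml_tensor * kq,
                                struct ggml_tensor * kq_mask,
                                float scale) {
    // 1.3.0-era call had an extra pos tensor:
    //   ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, scale, max_bias)
    return ggml_soft_max_ext(ctx, kq, kq_mask, scale, /*max_bias=*/0.0f);
}
```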
@@ -1443,9 +1409,8 @@ extern "C" {
  struct ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
- // if mode & 2 == 1, GPT-NeoX style
- // if mode & 4 == 1, ChatGLM style
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+ // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
  GGML_API struct ggml_tensor * ggml_rope(
@@ -1453,8 +1418,7 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx);
+ int mode);

  // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_inplace(
@@ -1462,18 +1426,18 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
- int mode,
- int n_ctx);
+ int mode);

  // custom RoPE
- GGML_API struct ggml_tensor * ggml_rope_custom(
+ // c is freq factors (e.g. phi3-128k), (optional)
+ GGML_API struct ggml_tensor * ggml_rope_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -1481,15 +1445,15 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+ GGML_API struct ggml_tensor * ggml_rope_multi(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
+ int sections[4],
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
@@ -1497,47 +1461,72 @@ extern "C" {
  float beta_fast,
  float beta_slow);

- // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
-
- // xPos RoPE, in-place, returns view(a)
- GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
+ struct ggml_tensor * c,
  int n_dims,
- float base,
- bool down);
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);

- // rotary position embedding backward, i.e compute dx from dy
- // a - dy
- GGML_API struct ggml_tensor * ggml_rope_back(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  int n_dims,
  int mode,
- int n_ctx,
- int n_orig_ctx,
+ int n_ctx_orig,
  float freq_base,
  float freq_scale,
  float ext_factor,
  float attn_factor,
  float beta_fast,
- float beta_slow,
- float xpos_base,
- bool xpos_down);
+ float beta_slow),
+ "use ggml_rope_ext instead");

- // alibi position embedding
- // in-place, returns view(a)
- GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- int n_past,
- int n_head,
- float bias_max),
- "use ggml_soft_max_ext instead (will be removed in Mar 2024)");
+ struct ggml_tensor * b,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow),
+ "use ggml_rope_ext_inplace instead");
+
+ // compute correction dims for YaRN RoPE scaling
+ GGML_API void ggml_rope_yarn_corr_dims(
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+
+ // rotary position embedding backward, i.e compute dx from dy
+ // a - dy
+ GGML_API struct ggml_tensor * ggml_rope_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // gradients of ggml_rope result
+ struct ggml_tensor * b, // positions
+ struct ggml_tensor * c, // freq factors
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);

  // clamp
  // in-place, returns view(a)
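Callers migrate from the now-deprecated `ggml_rope_custom` to `ggml_rope_ext` by dropping `n_ctx` and optionally passing frequency factors via `c`. A hedged sketch; the float parameter values are illustrative defaults, not prescribed by the header:

```c
// Drop-in for the deprecated ggml_rope_custom; c (freq factors) may be NULL.
struct ggml_tensor * rope_neox(struct ggml_context * ctx,
                               struct ggml_tensor * a,   // activations
                               struct ggml_tensor * pos, // int32 positions
                               int n_dims, int n_ctx_orig) {
    return ggml_rope_ext(ctx, a, pos, /*c=*/NULL,
                         n_dims, GGML_ROPE_TYPE_NEOX, n_ctx_orig,
                         /*freq_base=*/10000.0f, /*freq_scale=*/1.0f,
                         /*ext_factor=*/0.0f, /*attn_factor=*/1.0f,
                         /*beta_fast=*/32.0f, /*beta_slow=*/1.0f);
}
```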
@@ -1547,34 +1536,49 @@ extern "C" {
  float min,
  float max);

+ // im2col
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
  GGML_API struct ggml_tensor * ggml_im2col(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1,
- bool is_2D,
- enum ggml_type dst_type);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1, // dilation dimension 1
+ bool is_2D,
+ enum ggml_type dst_type);
+
+ GGML_API struct ggml_tensor * ggml_im2col_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // gradient of im2col output
+ int64_t * ne, // shape of im2col input
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1, // dilation dimension 1
+ bool is_2D);

  GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1); // dilation dimension 1

  GGML_API struct ggml_tensor * ggml_conv_1d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
  int s0, // stride
  int p0, // padding
  int d0); // dilation
@@ -1583,29 +1587,29 @@ extern "C" {
  // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
  GGML_API struct ggml_tensor* ggml_conv_1d_ph(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s,
- int d);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s, // stride
+ int d); // dilation

  GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int p0,
- int d0);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride
+ int p0, // padding
+ int d0); // dilation

  GGML_API struct ggml_tensor * ggml_conv_2d(
  struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- int s0,
- int s1,
- int p0,
- int p1,
- int d0,
- int d1);
+ struct ggml_tensor * a, // convolution kernel
+ struct ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1); // dilation dimension 1


@@ -1667,13 +1671,37 @@ extern "C" {
  float p0,
  float p1);

+ GGML_API struct ggml_tensor * ggml_pool_2d_back(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * af, // "a"/input used in forward pass
+ enum ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ float p0,
+ float p1);
+
  // nearest interpolate
+ // multiplies ne0 and ne1 by scale factor
  // used in stable-diffusion
  GGML_API struct ggml_tensor * ggml_upscale(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  int scale_factor);

+ // nearest interpolate
+ // nearest interpolate to specified dimensions
+ // used in tortoise.cpp
+ GGML_API struct ggml_tensor * ggml_upscale_ext(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3);
+
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
  GGML_API struct ggml_tensor * ggml_pad(
  struct ggml_context * ctx,
@@ -1683,6 +1711,13 @@ extern "C" {
  int p2,
  int p3);

+ // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+ GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int p0,
+ int p1);
+
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
  // timesteps: [N,]
  // return: [N, dim]
@@ -1715,13 +1750,31 @@ extern "C" {
  struct ggml_tensor * a,
  int k);

- GGML_API struct ggml_tensor * ggml_flash_attn(
+ #define GGML_KQ_MASK_PAD 32
+
+ // q: [n_embd, n_batch, n_head, 1]
+ // k: [n_embd, n_kv, n_head_kv, 1]
+ // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+ // res: [n_embd, n_head, n_batch, 1] !! permuted !!
+ GGML_API struct ggml_tensor * ggml_flash_attn_ext(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
  struct ggml_tensor * k,
  struct ggml_tensor * v,
- bool masked);
+ struct ggml_tensor * mask,
+ float scale,
+ float max_bias,
+ float logit_softcap);
+
+ GGML_API void ggml_flash_attn_ext_set_prec(
+ struct ggml_tensor * a,
+ enum ggml_prec prec);
+
+ GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+ const struct ggml_tensor * a);
+
+ // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
  struct ggml_tensor * q,
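`ggml_flash_attn_ext` replaces the removed `ggml_flash_attn`, fusing the mask, optional ALiBi bias, and logit soft-capping into one op. A hedged sketch; `GGML_PREC_F32` is assumed from the `ggml_prec` enum this header references, and the scale is the conventional 1/sqrt(head dim):

```c
#include <math.h> // sqrtf

// q/k/v/mask shapes per the comments above; all tensors built earlier in ctx.
struct ggml_tensor * fused_attn(struct ggml_context * ctx,
                                struct ggml_tensor * q,
                                struct ggml_tensor * k,
                                struct ggml_tensor * v,
                                struct ggml_tensor * mask,
                                int n_embd_head) {
    struct ggml_tensor * cur = ggml_flash_attn_ext(
        ctx, q, k, v, mask,
        1.0f / sqrtf((float) n_embd_head), // conventional attention scale
        /*max_bias=*/0.0f,                 // no ALiBi
        /*logit_softcap=*/0.0f);           // soft-capping disabled
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); // optional precision hint
    return cur;
}
```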
@@ -1730,20 +1783,10 @@ extern "C" {
  struct ggml_tensor * d,
  bool masked);

- GGML_API struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
  GGML_API struct ggml_tensor * ggml_ssm_conv(
  struct ggml_context * ctx,
- struct ggml_tensor * s,
- struct ggml_tensor * x,
- struct ggml_tensor * c,
- struct ggml_tensor * sq);
+ struct ggml_tensor * sx,
+ struct ggml_tensor * c);

  GGML_API struct ggml_tensor * ggml_ssm_scan(
  struct ggml_context * ctx,
@@ -1752,8 +1795,7 @@ extern "C" {
  struct ggml_tensor * dt,
  struct ggml_tensor * A,
  struct ggml_tensor * B,
- struct ggml_tensor * C,
- struct ggml_tensor * sq);
+ struct ggml_tensor * C);

  // partition into non-overlapping windows with padding if needed
  // example:
@@ -1805,6 +1847,15 @@ extern "C" {
  struct ggml_tensor * pw,
  struct ggml_tensor * ph);

+ GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
+ struct ggml_context * ctx,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ struct ggml_tensor * r,
+ struct ggml_tensor * tf,
+ struct ggml_tensor * td,
+ struct ggml_tensor * state);
+
  // custom operators

  typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1888,7 +1939,8 @@ extern "C" {
1888
1939
  typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
1889
1940
  typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
1890
1941
 
1891
- #define GGML_N_TASKS_MAX -1
1942
+ #define GGML_N_TASKS_MAX (-1)
1943
+ // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
1892
1944
 
1893
1945
  GGML_API struct ggml_tensor * ggml_map_custom1(
1894
1946
  struct ggml_context * ctx,
@@ -1941,49 +1993,59 @@ extern "C" {
1941
1993
  // loss function
1942
1994
 
1943
1995
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
1944
- struct ggml_context * ctx,
1945
- struct ggml_tensor * a,
1946
- struct ggml_tensor * b);
1996
+ struct ggml_context * ctx,
1997
+ struct ggml_tensor * a, // logits
1998
+ struct ggml_tensor * b); // labels
1947
1999
 
1948
2000
  GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
1949
- struct ggml_context * ctx,
1950
- struct ggml_tensor * a,
1951
- struct ggml_tensor * b,
1952
- struct ggml_tensor * c);
2001
+ struct ggml_context * ctx,
2002
+ struct ggml_tensor * a, // logits
2003
+ struct ggml_tensor * b, // labels
2004
+ struct ggml_tensor * c); // gradients of cross_entropy_loss result
2005
+
2006
+ // AdamW optimizer step
2007
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2008
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2009
+ GGML_API struct ggml_tensor * ggml_opt_step_adamw(
2010
+ struct ggml_context * ctx,
2011
+ struct ggml_tensor * a,
2012
+ struct ggml_tensor * grad,
2013
+ struct ggml_tensor * m,
2014
+ struct ggml_tensor * v,
2015
+ struct ggml_tensor * adamw_params); // parameters such a the learning rate
1953
2016
 
1954
2017
  //
1955
2018
  // automatic differentiation
1956
2019
  //
1957
2020
 
1958
- GGML_API void ggml_set_param(
1959
- struct ggml_context * ctx,
1960
- struct ggml_tensor * tensor);
2021
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
2022
+ GGML_API void ggml_build_backward_expand(
2023
+ struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
2024
+ struct ggml_context * ctx_compute, // context for gradient computation
2025
+ struct ggml_cgraph * cgraph,
2026
+ bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
1961
2027
 
2028
+ // graph allocation in a context
2029
+ GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
2030
+ GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
2031
+ GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
2032
+ GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
2033
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2034
+ GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
1962
2035
 
1963
- GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1964
- GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
2036
+ GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
2037
+ GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2038
+ GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
2039
+ GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
 
- // graph allocation in a context
- GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
- GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
- GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
- GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
- GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
- GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
+ GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
  GGML_API size_t ggml_graph_overhead(void);
  GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
- // ggml_graph_plan() has to be called before ggml_graph_compute()
- // when plan.work_size > 0, caller must allocate memory for plan.work_data
- GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
- GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
- // same as ggml_graph_compute() but the work data is allocated as a part of the context
- // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
- GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
- GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
+ GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
+ GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
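Given a graph `gf` built with grads enabled, lookups are now by name or by node; a hedged fragment (the tensor name "weight" is hypothetical, set earlier via ggml_set_name()):

    struct ggml_tensor * w  = ggml_graph_get_tensor(gf, "weight");  // lookup by name, NULL if absent
    struct ggml_tensor * gw = ggml_graph_get_grad(gf, w);           // gradient of w, NULL if none
    struct ggml_tensor * ga = ggml_graph_get_grad_acc(gf, w);       // gradient accumulator, NULL if none
    if (gw != NULL) {
        printf("grad of %s: %lld elements\n", ggml_get_name(w), (long long) ggml_nelements(gw));
    }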
 
  GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
  GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@@ -1994,197 +2056,14 @@ extern "C" {
  // dump the graph into a file using the dot format
  GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
 
- // build gradient checkpointing backward graph gb for gf using provided checkpoints
- // gb_tmp will contain original backward graph with rewritten backward process nodes,
- // but without the second forward pass nodes.
- GGML_API void ggml_build_backward_gradient_checkpointing(
- struct ggml_context * ctx,
- struct ggml_cgraph * gf,
- struct ggml_cgraph * gb,
- struct ggml_cgraph * gb_tmp,
- struct ggml_tensor * * checkpoints,
- int n_checkpoints);
- //
- // optimization
- //
-
- // optimization methods
- enum ggml_opt_type {
- GGML_OPT_TYPE_ADAM,
- GGML_OPT_TYPE_LBFGS,
- };
-
- // linesearch methods
- enum ggml_linesearch {
- GGML_LINESEARCH_DEFAULT = 1,
-
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
- };
-
- // optimization return values
- enum ggml_opt_result {
- GGML_OPT_RESULT_OK = 0,
- GGML_OPT_RESULT_DID_NOT_CONVERGE,
- GGML_OPT_RESULT_NO_CONTEXT,
- GGML_OPT_RESULT_INVALID_WOLFE,
- GGML_OPT_RESULT_FAIL,
- GGML_OPT_RESULT_CANCEL,
-
- GGML_LINESEARCH_FAIL = -128,
- GGML_LINESEARCH_MINIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
- GGML_LINESEARCH_INVALID_PARAMETERS,
- };
-
- typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+ // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
- // optimization parameters
- //
- // see ggml.c (ggml_opt_default_params) for default values
- //
- struct ggml_opt_params {
- enum ggml_opt_type type;
-
- size_t graph_size;
-
- int n_threads;
-
- // delta-based convergence test
- //
- // if past == 0 - disabled
- // if past > 0:
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
- //
- int past;
- float delta;
-
- // maximum number of iterations without improvement
- //
- // if 0 - disabled
- // if > 0:
- // assume convergence if no cost improvement in this number of iterations
- //
- int max_no_improvement;
-
- bool print_forward_graph;
- bool print_backward_graph;
-
- int n_gradient_accumulation;
-
- // ADAM parameters
- struct {
- int n_iter;
-
- float sched; // schedule multiplier (fixed, decay or warmup)
- float decay; // weight decay for AdamW, use 0.0f to disable
- int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
- float alpha; // learning rate
- float beta1;
- float beta2;
- float eps; // epsilon for numerical stability
- float eps_f; // epsilon for convergence test
- float eps_g; // epsilon for convergence test
- float gclip; // gradient clipping
- } adam;
-
- // LBFGS parameters
- struct {
- int m; // number of corrections to approximate the inv. Hessian
- int n_iter;
- int max_linesearch;
-
- float eps; // convergence tolerance
- float ftol; // line search tolerance
- float wolfe;
- float min_step;
- float max_step;
-
- enum ggml_linesearch linesearch;
- } lbfgs;
- };
-
- struct ggml_opt_context {
- struct ggml_context * ctx;
- struct ggml_opt_params params;
-
- int iter;
- int64_t nx; // number of parameter elements
-
- bool just_initialized;
-
- float loss_before;
- float loss_after;
-
- struct {
- struct ggml_tensor * g; // current gradient
- struct ggml_tensor * m; // first moment
- struct ggml_tensor * v; // second moment
- struct ggml_tensor * pf; // past function values
- float fx_best;
- float fx_prev;
- int n_no_improvement;
- } adam;
-
- struct {
- struct ggml_tensor * x; // current parameters
- struct ggml_tensor * xp; // previous parameters
- struct ggml_tensor * g; // current gradient
- struct ggml_tensor * gp; // previous gradient
- struct ggml_tensor * d; // search direction
- struct ggml_tensor * pf; // past function values
- struct ggml_tensor * lmal; // the L-BFGS memory alpha
- struct ggml_tensor * lmys; // the L-BFGS memory ys
- struct ggml_tensor * lms; // the L-BFGS memory s
- struct ggml_tensor * lmy; // the L-BFGS memory y
- float fx_best;
- float step;
- int j;
- int k;
- int end;
- int n_no_improvement;
- } lbfgs;
- };
-
- GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
- // optimize the function defined by the tensor f
- GGML_API enum ggml_opt_result ggml_opt(
- struct ggml_context * ctx,
- struct ggml_opt_params params,
- struct ggml_tensor * f);
-
- // initialize optimizer context
- GGML_API void ggml_opt_init(
- struct ggml_context * ctx,
- struct ggml_opt_context * opt,
- struct ggml_opt_params params,
- int64_t nx);
-
- // continue optimizing the function defined by the tensor f
- GGML_API enum ggml_opt_result ggml_opt_resume(
- struct ggml_context * ctx,
- struct ggml_opt_context * opt,
- struct ggml_tensor * f);
-
- // continue optimizing the function defined by the tensor f
- GGML_API enum ggml_opt_result ggml_opt_resume_g(
- struct ggml_context * ctx,
- struct ggml_opt_context * opt,
- struct ggml_tensor * f,
- struct ggml_cgraph * gf,
- struct ggml_cgraph * gb,
- ggml_opt_callback callback,
- void * callback_data);
+ // Set callback for all future logging events.
+ // If this is not called, or NULL is supplied, everything is output on stderr.
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
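A small sketch of hooking the logger with the signature above (the tag string and prefix format are illustrative; messages typically arrive with their own trailing newline):

    #include "ggml.h"
    #include <stdio.h>

    // forward ggml log lines to stderr with a prefix; user_data is passed through verbatim
    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        fprintf(stderr, "[%s/%d] %s", (const char *) user_data, (int) level, text);
    }

    int main(void) {
        ggml_log_set(my_log, (void *) "ggml");  // install the callback
        // ... any ggml calls that log now go through my_log ...
        ggml_log_set(NULL, NULL);               // restore plain stderr output
        return 0;
    }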
 
- //
- // tensor flags
- //
- GGML_API void ggml_set_input(struct ggml_tensor * tensor);
- GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
 
  //
  // quantization
@@ -2289,6 +2168,9 @@ extern "C" {
  GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
  GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
 
+ // removes key if it exists
+ GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
+
  // overrides existing values or adds a new one
  GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
  GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
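The new gguf_remove_key() pairs naturally with the setters above; a minimal sketch (the key name is illustrative):

    struct gguf_context * gctx = gguf_init_empty();

    gguf_set_val_u8 (gctx, "demo.flag", 1);  // adds the key
    gguf_set_val_u8 (gctx, "demo.flag", 0);  // overwrites it in place
    gguf_remove_key (gctx, "demo.flag");     // deletes it; a no-op if the key is absent

    gguf_free(gctx);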
@@ -2338,64 +2220,65 @@ extern "C" {
  GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
  GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
- //
- // system info
- //
-
- GGML_API int ggml_cpu_has_avx (void);
- GGML_API int ggml_cpu_has_avx_vnni (void);
- GGML_API int ggml_cpu_has_avx2 (void);
- GGML_API int ggml_cpu_has_avx512 (void);
- GGML_API int ggml_cpu_has_avx512_vbmi(void);
- GGML_API int ggml_cpu_has_avx512_vnni(void);
- GGML_API int ggml_cpu_has_fma (void);
- GGML_API int ggml_cpu_has_neon (void);
- GGML_API int ggml_cpu_has_arm_fma (void);
- GGML_API int ggml_cpu_has_metal (void);
- GGML_API int ggml_cpu_has_f16c (void);
- GGML_API int ggml_cpu_has_fp16_va (void);
- GGML_API int ggml_cpu_has_wasm_simd (void);
- GGML_API int ggml_cpu_has_blas (void);
- GGML_API int ggml_cpu_has_cuda (void);
- GGML_API int ggml_cpu_has_clblast (void);
- GGML_API int ggml_cpu_has_vulkan (void);
- GGML_API int ggml_cpu_has_kompute (void);
- GGML_API int ggml_cpu_has_gpublas (void);
- GGML_API int ggml_cpu_has_sse3 (void);
- GGML_API int ggml_cpu_has_ssse3 (void);
- GGML_API int ggml_cpu_has_sycl (void);
- GGML_API int ggml_cpu_has_vsx (void);
- GGML_API int ggml_cpu_has_matmul_int8(void);
-
- //
- // Internal types and functions exposed for tests and benchmarks
- //
-
- #ifdef __cplusplus
- // restrict not standard in C++
- #define GGML_RESTRICT
+ #ifdef __cplusplus
+ // restrict not standard in C++
+ # if defined(__GNUC__)
+ # define GGML_RESTRICT __restrict__
+ # elif defined(__clang__)
+ # define GGML_RESTRICT __restrict
+ # elif defined(_MSC_VER)
+ # define GGML_RESTRICT __restrict
+ # else
+ # define GGML_RESTRICT
+ # endif
  #else
- #define GGML_RESTRICT restrict
+ # define GGML_RESTRICT restrict
  #endif
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
- typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
- const void * GGML_RESTRICT y, size_t by, int nrc);
-
- typedef struct {
- const char * type_name;
- int blck_size;
- size_t type_size;
- bool is_quantized;
- ggml_to_float_t to_float;
- ggml_from_float_t from_float;
- ggml_from_float_t from_float_reference;
- ggml_vec_dot_t vec_dot;
- enum ggml_type vec_dot_type;
- int64_t nrows; // number of rows to process simultaneously;
- } ggml_type_traits_t;
-
- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+
+ struct ggml_type_traits {
+ const char * type_name;
+ int64_t blck_size;
+ int64_t blck_size_interleave; // interleave elements in blocks
+ size_t type_size;
+ bool is_quantized;
+ ggml_to_float_t to_float;
+ ggml_from_float_t from_float_ref;
+ };
+
+ GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
+
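The traits table is now returned by const pointer instead of by value; a short sketch querying it (GGML_TYPE_Q4_0 chosen arbitrarily):

    const struct ggml_type_traits * tt = ggml_get_type_traits(GGML_TYPE_Q4_0);

    printf("%s: %lld elements per block, %zu bytes per block, quantized: %d\n",
           tt->type_name, (long long) tt->blck_size, tt->type_size, tt->is_quantized ? 1 : 0);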
+ // ggml threadpool
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+ // the goal should be to create an API that other backends can use, and to move everything into the ggml base
+
+ // scheduling priorities
+ enum ggml_sched_priority {
+ GGML_SCHED_PRIO_NORMAL,
+ GGML_SCHED_PRIO_MEDIUM,
+ GGML_SCHED_PRIO_HIGH,
+ GGML_SCHED_PRIO_REALTIME
+ };
+
+ // threadpool params
+ // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+ struct ggml_threadpool_params {
+ bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+ int n_threads; // number of threads
+ enum ggml_sched_priority prio; // thread priority
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+ bool strict_cpu; // strict cpu placement
+ bool paused; // start in paused state
+ };
+
+ struct ggml_threadpool; // forward declaration, see ggml.c
+
+ typedef struct ggml_threadpool * ggml_threadpool_t;
+
+ GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+ GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+ GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
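Only the parameter helpers live in the base API for now; per the TODO above, actually creating and running the pool happens in the CPU backend (declared in ggml-cpu.h in this release). A sketch of preparing the params, with the thread count and tuning values chosen for illustration:

    // start from the defaults for 8 threads, then tighten scheduling
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    tpp.prio       = GGML_SCHED_PRIO_HIGH;
    tpp.poll       = 50;    // between no polling (0) and aggressive polling (100)
    tpp.strict_cpu = true;  // honor tpp.cpumask strictly when placing threads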
 
  #ifdef __cplusplus
  }