@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
Expanded diff for package/src/llama.cpp/ggml/include/ggml.h (entry 96 above):

@@ -176,15 +176,15 @@
 #ifdef GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
+#            define GGML_API __declspec(dllexport) extern
 #        else
-#            define GGML_API __declspec(dllimport)
+#            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default")))
+#        define GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define GGML_API
+#    define GGML_API extern
 #endif

 // TODO: support for clang
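
Every expansion of GGML_API now ends in `extern`, so the linkage of each header declaration is explicit in all build configurations. A minimal illustration of the static-build case, using a stand-in macro (MY_GGML_API is hypothetical, named only to avoid restating the real one):

    /* Static build (GGML_SHARED undefined): GGML_API used to expand to
       nothing; it now expands to `extern`. */
    #define MY_GGML_API extern   /* stand-in for the new expansion */

    struct ggml_context;         /* opaque type, as in ggml.h */
    MY_GGML_API void ggml_free(struct ggml_context * ctx);
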
@@ -217,7 +217,6 @@
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
@@ -510,7 +509,7 @@ extern "C" {
         GGML_OP_WIN_UNPART,
         GGML_OP_GET_REL_POS,
         GGML_OP_ADD_REL_POS,
-        GGML_OP_RWKV_WKV,
+        GGML_OP_RWKV_WKV6,

         GGML_OP_UNARY,
@@ -559,10 +558,10 @@ extern "C" {

     enum ggml_log_level {
         GGML_LOG_LEVEL_NONE  = 0,
-        GGML_LOG_LEVEL_INFO  = 1,
-        GGML_LOG_LEVEL_WARN  = 2,
-        GGML_LOG_LEVEL_ERROR = 3,
-        GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_DEBUG = 1,
+        GGML_LOG_LEVEL_INFO  = 2,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_ERROR = 4,
         GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };

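
The severity values were reshuffled so that DEBUG (1) now sorts below INFO (2), WARN (3), and ERROR (4); any caller that stored or compared the raw integers needs updating. A filtering-callback sketch for ggml_log_set(), assuming only the callback typedef that appears unchanged further down in this diff:

    #include <stdio.h>
    #include "ggml.h"

    /* Drop everything below a caller-chosen threshold; this numeric
       comparison is only meaningful with the new ordering. */
    static void log_filter(enum ggml_log_level level, const char * text, void * user_data) {
        const enum ggml_log_level min_level = *(const enum ggml_log_level *) user_data;
        if (level != GGML_LOG_LEVEL_CONT && level < min_level) {
            return;
        }
        fputs(text, stderr);
    }

    /* usage: static enum ggml_log_level lvl = GGML_LOG_LEVEL_WARN;
              ggml_log_set(log_filter, &lvl); */
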
@@ -574,6 +573,13 @@ extern "C" {
         GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };

+    struct ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
+    };
+
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
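
The struct itself is unchanged; it only moves up in the header (its old location is deleted in a later hunk below). A minimal context-setup sketch using it:

    #include "ggml.h"

    static struct ggml_context * make_ctx(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16u * 1024 * 1024, // 16 MiB pool
            /*.mem_buffer =*/ NULL,              // NULL => ggml allocates internally
            /*.no_alloc   =*/ false,             // tensor data lives in the pool
        };
        return ggml_init(params); // pair with ggml_free()
    }
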
@@ -596,7 +602,6 @@ extern "C" {

         int32_t flags;

-        struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];

         // source tensor and offset for views
@@ -609,7 +614,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        // char padding[4];
+        char padding[8];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -619,66 +624,6 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);

-    // Scheduling priorities
-    enum ggml_sched_priority {
-        GGML_SCHED_PRIO_NORMAL,
-        GGML_SCHED_PRIO_MEDIUM,
-        GGML_SCHED_PRIO_HIGH,
-        GGML_SCHED_PRIO_REALTIME
-    };
-
-    // Threadpool params
-    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-    struct ggml_threadpool_params {
-        bool                     cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-        int                      n_threads;                   // number of threads
-        enum ggml_sched_priority prio;                        // thread priority
-        uint32_t                 poll;                        // polling level (0 - no polling, 100 - aggressive polling)
-        bool                     strict_cpu;                  // strict cpu placement
-        bool                     paused;                      // start in paused state
-    };
-
-    struct ggml_threadpool; // forward declaration, see ggml.c
-
-    typedef struct ggml_threadpool * ggml_threadpool_t;
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
-    // scratch buffer
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-    // numa strategies
-    enum ggml_numa_strategy {
-        GGML_NUMA_STRATEGY_DISABLED   = 0,
-        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
-        GGML_NUMA_STRATEGY_ISOLATE    = 2,
-        GGML_NUMA_STRATEGY_NUMACTL    = 3,
-        GGML_NUMA_STRATEGY_MIRROR     = 4,
-        GGML_NUMA_STRATEGY_COUNT
-    };

     //
     // GUID
@@ -701,9 +646,6 @@ extern "C" {
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

-    GGML_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

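
The threadpool, ggml_cplan, and NUMA declarations removed in the two hunks above were not dropped from the project: they move into the new ggml-cpu.h header (entry 88 in the file list) as part of this release's backend split. A sketch of the updated include pattern, assuming the relocated declarations keep their old names and signatures:

    #include <stdlib.h>
    #include "ggml.h"
    #include "ggml-cpu.h" // now hosts ggml_graph_plan(), ggml_graph_compute(), NUMA/threadpool APIs

    static enum ggml_status run_on_cpu(struct ggml_cgraph * graph, int n_threads) {
        struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, /*threadpool=*/NULL);
        uint8_t * work = NULL;
        if (plan.work_size > 0) {
            work = malloc(plan.work_size); // work buffer is still caller-allocated
            plan.work_data = work;
        }
        const enum ggml_status status = ggml_graph_compute(graph, &plan);
        free(work);
        return status;
    }
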
@@ -760,12 +702,12 @@ extern "C" {

     // main

-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
+    GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
+    GGML_API void                  ggml_reset(struct ggml_context * ctx);
+    GGML_API void                  ggml_free (struct ggml_context * ctx);

     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);

-    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);

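
ggml_reset() slots in between init and free: it recycles the context's memory pool without releasing it, replacing the deleted scratch-buffer mechanism for per-iteration reuse. A sketch (build_step() is a hypothetical helper):

    void build_step(struct ggml_context * ctx); /* hypothetical: rebuilds one step's graph */

    static void run_steps(struct ggml_context * ctx, int n_steps) {
        for (int step = 0; step < n_steps; step++) {
            ggml_reset(ctx);  // discard all objects from the previous iteration
            build_step(ctx);  // reuse the same pool instead of free()/init()
        }
    }
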
@@ -805,8 +747,7 @@ extern "C" {
             int64_t ne2,
             int64_t ne3);

-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);

     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
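
ggml_new_i32()/ggml_new_f32() disappear; a 1-element tensor written through its data pointer covers the same use, while the new ggml_new_buffer() hands back raw bytes from the context pool. A replacement sketch:

    /* stand-in for the removed ggml_new_f32(ctx, value) */
    static struct ggml_tensor * new_f32_scalar(struct ggml_context * ctx, float value) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
        *(float *) t->data = value; // assumes the context was created with no_alloc == false
        return t;
    }
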
@@ -816,35 +757,25 @@
     GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
     // Converts a flat index into coordinates
-    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
-
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);

-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
     GGML_ATTRIBUTE_FORMAT(2, 3)
     GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);

+    // Tensor flags
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+
     //
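
With the scalar accessors (ggml_get_f32_1d() and friends) gone from the public header, writing through ggml_get_data_f32() is the remaining route for host-visible tensors; the flag setters are the same functions as before, only moved here from the end of the header. A sketch:

    static void fill_ramp(struct ggml_context * ctx) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        float * data = ggml_get_data_f32(t);
        for (int i = 0; i < 8; i++) {
            data[i] = (float) i; // direct writes replace ggml_set_f32_1d()
        }
        ggml_set_input(t); // flag helpers are now declared in this block
    }
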
@@ -1558,7 +1489,7 @@ extern "C" {
             "use ggml_rope_ext_inplace instead");

     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_API void ggml_rope_yarn_corr_dims(
         int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // rotary position embedding backward, i.e compute dx from dy
@@ -1814,6 +1745,9 @@ extern "C" {
             struct ggml_tensor * a,
             enum   ggml_prec     prec);

+    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+            const struct ggml_tensor * a);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
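
This is the read-side counterpart to the existing ggml_flash_attn_ext_set_prec() whose tail is visible above. A sketch of checking whether a flash-attention node was pinned to f32 accumulation (assumes "ggml.h" is included):

    static bool needs_f32_path(const struct ggml_tensor * attn_node) {
        return ggml_flash_attn_ext_get_prec(attn_node) == GGML_PREC_F32;
    }
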
@@ -1887,7 +1821,7 @@ extern "C" {
             struct ggml_tensor  * pw,
             struct ggml_tensor  * ph);

-    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
+    GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
             struct ggml_context * ctx,
             struct ggml_tensor  * k,
             struct ggml_tensor  * v,
@@ -2050,31 +1984,20 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * grad,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+            struct ggml_tensor  * m,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * adamw_params); // parameters such as the learning rate

     //
     // automatic differentiation
     //

-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
-
-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
-
-    GGML_API void ggml_build_opt_adamw(
-            struct ggml_context * ctx,
-            struct ggml_cgraph  * gf,
-            struct ggml_cgraph  * gb,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
+        struct ggml_context * ctx_compute, // context for gradient computation
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static

     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
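
The backward pass now takes two contexts and a single graph instead of a gf/gb pair, and the AdamW step receives its moments and hyper-parameters as tensors rather than scalars. A sketch of the reworked graph construction, assuming the weights were marked with ggml_set_param() and `loss` with ggml_set_loss():

    static struct ggml_cgraph * build_training_graph(
            struct ggml_context * ctx_static,   // holds statically allocated gradients
            struct ggml_context * ctx_compute,  // holds the computation itself
            struct ggml_tensor  * loss) {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads=*/true);
        ggml_build_forward_expand(gf, loss);
        ggml_build_backward_expand(ctx_static, ctx_compute, gf, /*accumulate=*/false);
        return gf;
    }
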
@@ -2094,28 +2017,9 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

-    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
-    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
-    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
-    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                        int                  n_threads, /* = GGML_DEFAULT_N_THREADS */
-                        struct ggml_threadpool * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor  (const struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);

     GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
     GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
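
Because the `grad` field was removed from ggml_tensor earlier in this diff, gradients are now looked up on the graph. A sketch, assuming a backward pass was built as in the previous example (the grad-acc fallback logic is an assumption based on the accumulate flag's documentation):

    static struct ggml_tensor * weight_grad(
            const struct ggml_cgraph * gf,
            const struct ggml_tensor * weights) {
        struct ggml_tensor * acc = ggml_graph_get_grad_acc(gf, weights); // with accumulate=true
        struct ggml_tensor * g   = ggml_graph_get_grad    (gf, weights); // per-step gradient
        return acc != NULL ? acc : g;
    }
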
@@ -2126,201 +2030,14 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_TYPE_ADAM,
-        GGML_OPT_TYPE_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_RESULT_OK = 0,
-        GGML_OPT_RESULT_DID_NOT_CONVERGE,
-        GGML_OPT_RESULT_NO_CONTEXT,
-        GGML_OPT_RESULT_INVALID_WOLFE,
-        GGML_OPT_RESULT_FAIL,
-        GGML_OPT_RESULT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);

     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched; // schedule multiplier (fixed, decay or warmup)
-            float decay; // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-            float gclip; // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int j;
-            int k;
-            int end;
-            int n_no_improvement;
-        } lbfgs;
-    };
-
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
-    //
-    // tensor flags
-    //
-    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

     //
     // quantization
@@ -2477,47 +2194,6 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_avx512_bf16(void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_sve        (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cuda       (void);
-    GGML_API int ggml_cpu_has_vulkan     (void);
-    GGML_API int ggml_cpu_has_kompute    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_riscv_v    (void);
-    GGML_API int ggml_cpu_has_sycl       (void);
-    GGML_API int ggml_cpu_has_rpc        (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);
-    GGML_API int ggml_cpu_has_cann       (void);
-    GGML_API int ggml_cpu_has_llamafile  (void);
-
-    // get the sve vector length in bytes
-    GGML_API int ggml_cpu_get_sve_cnt(void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
 #ifdef __cplusplus
 // restrict not standard in C++
 #define GGML_RESTRICT
@@ -2526,34 +2202,18 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-    typedef void (*ggml_from_float_to_mat_t)
-                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
-    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                      const void * GGML_RESTRICT y, size_t by, int nrc);
-    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                      const void * GGML_RESTRICT y, int nr, int nc);
-    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
-                                      const void * GGML_RESTRICT y, int nr, int nc);
-
-    typedef struct {
+
+    struct ggml_type_traits {
         const char             * type_name;
         int64_t                  blck_size;
         int64_t                  blck_size_interleave; // interleave elements in blocks
         size_t                   type_size;
         bool                     is_quantized;
         ggml_to_float_t          to_float;
-        ggml_from_float_t        from_float;
         ggml_from_float_t        from_float_ref;
-        ggml_from_float_to_mat_t from_float_to_mat;
-        ggml_vec_dot_t           vec_dot;
-        enum ggml_type           vec_dot_type;
-        int64_t                  nrows; // number of rows to process simultaneously
-        int64_t                  ncols; // number of columns to process simultaneously
-        ggml_gemv_t              gemv;
-        ggml_gemm_t              gemm;
-    } ggml_type_traits_t;
-
-    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+    };
+
+    GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

 #ifdef __cplusplus
 }
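
Migration sketch for the renamed accessor: the typedef loses its _t suffix and the function now returns a const pointer instead of a struct copy (the dropped kernel-dispatch members appear to move out of the public struct along with the rest of the CPU backend split):

    #include <stdio.h>
    #include "ggml.h"

    static void print_block_size(enum ggml_type type) {
        /* before: ggml_type_traits_t tt = ggml_internal_get_type_traits(type); */
        const struct ggml_type_traits * tt = ggml_get_type_traits(type);
        printf("%s: %lld elements per block\n", tt->type_name, (long long) tt->blck_size);
    }

    /* usage: print_block_size(GGML_TYPE_Q4_0); */
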