@novastera-oss/llamarn 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. package/android/src/main/cpp/include/llama.h +134 -36
  2. package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
  3. package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
  6. package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
  7. package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
  8. package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
  10. package/cpp/LlamaCppModel.cpp +2 -2
  11. package/cpp/LlamaCppModel.h +3 -3
  12. package/cpp/PureCppImpl.cpp +1 -1
  13. package/cpp/PureCppImpl.h +2 -2
  14. package/cpp/build-info.cpp +2 -2
  15. package/cpp/llama.cpp/CMakeLists.txt +15 -4
  16. package/cpp/llama.cpp/Makefile +2 -2
  17. package/cpp/llama.cpp/README.md +32 -13
  18. package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
  19. package/cpp/llama.cpp/common/arg.cpp +30 -6
  20. package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
  21. package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
  22. package/cpp/llama.cpp/common/chat-parser.h +2 -0
  23. package/cpp/llama.cpp/common/chat.cpp +12 -9
  24. package/cpp/llama.cpp/common/chat.h +1 -1
  25. package/cpp/llama.cpp/common/common.cpp +50 -40
  26. package/cpp/llama.cpp/common/common.h +5 -2
  27. package/cpp/llama.cpp/common/speculative.cpp +6 -4
  28. package/cpp/llama.cpp/convert_hf_to_gguf.py +97 -56
  29. package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
  30. package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
  31. package/cpp/llama.cpp/ggml/src/CMakeLists.txt +47 -13
  32. package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
  33. package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
  34. package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
  35. package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
  36. package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
  37. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
  38. package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
  39. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  40. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
  41. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
  42. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
  43. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
  44. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
  45. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
  46. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
  47. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
  48. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
  49. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
  50. package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
  51. package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
  52. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
  53. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +10 -2
  54. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
  55. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
  56. package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
  57. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
  58. package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
  59. package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
  60. package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
  61. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  62. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
  63. package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
  64. package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
  65. package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  66. package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +5 -8
  67. package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
  68. package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +6 -8
  69. package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
  70. package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
  71. package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
  72. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
  73. package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
  74. package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
  75. package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
  76. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
  77. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
  78. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
  79. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
  80. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
  81. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
  82. package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
  83. package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
  84. package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
  85. package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
  86. package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
  87. package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
  88. package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
  89. package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
  90. package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
  91. package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
  92. package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -38
  93. package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
  94. package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
  95. package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
  96. package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
  97. package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +431 -247
  98. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
  99. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
  100. package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
  101. package/cpp/llama.cpp/ggml/src/ggml.c +0 -6
  102. package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
  103. package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
  104. package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
  105. package/cpp/llama.cpp/include/llama.h +134 -36
  106. package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
  107. package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
  108. package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
  109. package/cpp/llama.cpp/src/llama-arch.h +7 -1
  110. package/cpp/llama.cpp/src/llama-batch.cpp +270 -19
  111. package/cpp/llama.cpp/src/llama-batch.h +36 -11
  112. package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
  113. package/cpp/llama.cpp/src/llama-chat.h +1 -0
  114. package/cpp/llama.cpp/src/llama-context.cpp +313 -213
  115. package/cpp/llama.cpp/src/llama-context.h +16 -12
  116. package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
  117. package/cpp/llama.cpp/src/llama-cparams.h +1 -1
  118. package/cpp/llama.cpp/src/llama-graph.cpp +249 -129
  119. package/cpp/llama.cpp/src/llama-graph.h +90 -34
  120. package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
  121. package/cpp/llama.cpp/src/llama-hparams.h +8 -2
  122. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +82 -50
  123. package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +23 -26
  124. package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +292 -174
  125. package/cpp/llama.cpp/src/llama-kv-cache-unified.h +68 -38
  126. package/cpp/llama.cpp/src/llama-kv-cells.h +18 -13
  127. package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
  128. package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
  129. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +266 -282
  130. package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +54 -57
  131. package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
  132. package/cpp/llama.cpp/src/llama-memory.h +64 -23
  133. package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
  134. package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
  135. package/cpp/llama.cpp/src/llama-model.cpp +726 -141
  136. package/cpp/llama.cpp/src/llama-model.h +4 -0
  137. package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
  138. package/cpp/llama.cpp/src/llama-vocab.cpp +32 -23
  139. package/cpp/llama.cpp/src/llama.cpp +11 -7
  140. package/cpp/llama.cpp/src/unicode.cpp +5 -0
  141. package/cpp/rn-completion.cpp +2 -2
  142. package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
  143. package/ios/include/chat.h +1 -1
  144. package/ios/include/common.h +5 -2
  145. package/ios/include/llama.h +134 -36
  146. package/ios/libs/llama.xcframework/Info.plist +18 -18
  147. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  148. package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  149. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +134 -36
  150. package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
  151. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  152. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  153. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  154. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  155. package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
  156. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  157. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  158. package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3624
  159. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +134 -36
  160. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +134 -36
  161. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
  162. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +134 -36
  163. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
  164. package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
  165. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  166. package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4689
  167. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +134 -36
  168. package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
  169. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  170. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4710
  171. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3622
  172. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  173. package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
  174. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  175. package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4725
  176. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +134 -36
  177. package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
  178. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
  179. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4746
  180. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3652
  181. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +134 -36
  182. package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
  183. package/package.json +1 -2
  184. package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
  185. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  186. package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
  187. package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
  188. package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
  189. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
  190. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
  191. /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  192. /package/cpp/{rn-utils.hpp → rn-utils.h} +0 -0
@@ -61,7 +61,10 @@ extern "C" {
61
61
  struct llama_model;
62
62
  struct llama_context;
63
63
  struct llama_sampler;
64
- struct llama_kv_cache;
64
+
65
+ typedef struct llama_memory_i * llama_memory_t;
66
+
67
+ struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
65
68
 
66
69
  typedef int32_t llama_pos;
67
70
  typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
240
243
 
241
244
  typedef bool (*llama_progress_callback)(float progress, void * user_data);
242
245
 
243
- // Input data for llama_decode
246
+ // Input data for llama_encode/llama_decode
244
247
  // A llama_batch object can contain input about one or many sequences
245
248
  // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
246
249
  //
247
250
  // - token : the token ids of the input (used when embd is NULL)
248
251
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
249
252
  // - pos : the positions of the respective token in the sequence
250
- // (if set to NULL, the token position will be tracked automatically by llama_decode)
253
+ // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
251
254
  // - seq_id : the sequence to which the respective token belongs
252
255
  // (if set to NULL, the sequence ID will be assumed to be 0)
253
256
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
254
- // (if set to NULL, only the logits for last token will be returned)
257
+ // (if set to NULL:
258
+ // - if embeddings: all tokens are output
259
+ // - if not: only the last token is output
260
+ // )
255
261
  //
256
262
  typedef struct llama_batch {
257
263
  int32_t n_tokens;
@@ -259,8 +265,8 @@ extern "C" {
259
265
  llama_token * token;
260
266
  float * embd;
261
267
  llama_pos * pos;
262
- int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
- llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
268
+ int32_t * n_seq_id;
269
+ llama_seq_id ** seq_id;
264
270
  int8_t * logits; // TODO: rename this to "output"
265
271
  } llama_batch;
266
272
 
@@ -493,9 +499,11 @@ extern "C" {
493
499
  DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
494
500
 
495
501
  LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
496
- LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
502
+ LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
497
503
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
498
504
 
505
+ DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
506
+
499
507
  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
500
508
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
501
509
 
@@ -509,6 +517,13 @@ extern "C" {
509
517
  // Get the model's RoPE frequency scaling factor
510
518
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
511
519
 
520
+ // Returns the number of classifier outputs (only valid for classifier models)
521
+ // Undefined behavior for non-classifier models
522
+ LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
523
+
524
+ // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
525
+ LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
526
+
512
527
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
513
528
 
514
529
  LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
@@ -609,7 +624,81 @@ extern "C" {
609
624
  int32_t il_end);
610
625
 
611
626
  //
612
- // KV cache
627
+ // Memory
628
+ //
629
+
630
+ // Clear the memory contents
631
+ // If data == true, the data buffers will also be cleared together with the metadata
632
+ LLAMA_API void llama_memory_clear(
633
+ llama_memory_t mem,
634
+ bool data);
635
+
636
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
637
+ // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
638
+ // seq_id < 0 : match any sequence
639
+ // p0 < 0 : [0, p1]
640
+ // p1 < 0 : [p0, inf)
641
+ LLAMA_API bool llama_memory_seq_rm(
642
+ llama_memory_t mem,
643
+ llama_seq_id seq_id,
644
+ llama_pos p0,
645
+ llama_pos p1);
646
+
647
+ // Copy all tokens that belong to the specified sequence to another sequence
648
+ // p0 < 0 : [0, p1]
649
+ // p1 < 0 : [p0, inf)
650
+ LLAMA_API void llama_memory_seq_cp(
651
+ llama_memory_t mem,
652
+ llama_seq_id seq_id_src,
653
+ llama_seq_id seq_id_dst,
654
+ llama_pos p0,
655
+ llama_pos p1);
656
+
657
+ // Removes all tokens that do not belong to the specified sequence
658
+ LLAMA_API void llama_memory_seq_keep(
659
+ llama_memory_t mem,
660
+ llama_seq_id seq_id);
661
+
662
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
663
+ // p0 < 0 : [0, p1]
664
+ // p1 < 0 : [p0, inf)
665
+ LLAMA_API void llama_memory_seq_add(
666
+ llama_memory_t mem,
667
+ llama_seq_id seq_id,
668
+ llama_pos p0,
669
+ llama_pos p1,
670
+ llama_pos delta);
671
+
672
+ // Integer division of the positions by factor of `d > 1`
673
+ // p0 < 0 : [0, p1]
674
+ // p1 < 0 : [p0, inf)
675
+ LLAMA_API void llama_memory_seq_div(
676
+ llama_memory_t mem,
677
+ llama_seq_id seq_id,
678
+ llama_pos p0,
679
+ llama_pos p1,
680
+ int d);
681
+
682
+ // Returns the smallest position present in the memory for the specified sequence
683
+ // This is typically non-zero only for SWA caches
684
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
685
+ // Return -1 if the sequence is empty
686
+ LLAMA_API llama_pos llama_memory_seq_pos_min(
687
+ llama_memory_t mem,
688
+ llama_seq_id seq_id);
689
+
690
+ // Returns the largest position present in the memory for the specified sequence
691
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
692
+ // Return -1 if the sequence is empty
693
+ LLAMA_API llama_pos llama_memory_seq_pos_max(
694
+ llama_memory_t mem,
695
+ llama_seq_id seq_id);
696
+
697
+ // Check if the memory supports shifting
698
+ LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
699
+
700
+ //
701
+ // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
613
702
  //
614
703
 
615
704
  // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -622,86 +711,95 @@ extern "C" {
622
711
  "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
623
712
 
624
713
  // Clear the KV cache - both cell info is erased and KV data is zeroed
625
- LLAMA_API void llama_kv_self_clear(
626
- struct llama_context * ctx);
714
+ DEPRECATED(LLAMA_API void llama_kv_self_clear(
715
+ struct llama_context * ctx),
716
+ "Use llama_memory_clear() instead");
627
717
 
628
718
  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
629
719
  // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
630
720
  // seq_id < 0 : match any sequence
631
721
  // p0 < 0 : [0, p1]
632
722
  // p1 < 0 : [p0, inf)
633
- LLAMA_API bool llama_kv_self_seq_rm(
723
+ DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
634
724
  struct llama_context * ctx,
635
725
  llama_seq_id seq_id,
636
726
  llama_pos p0,
637
- llama_pos p1);
727
+ llama_pos p1),
728
+ "Use llama_memory_seq_rm() instead");
638
729
 
639
730
  // Copy all tokens that belong to the specified sequence to another sequence
640
731
  // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
641
732
  // p0 < 0 : [0, p1]
642
733
  // p1 < 0 : [p0, inf)
643
- LLAMA_API void llama_kv_self_seq_cp(
734
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
644
735
  struct llama_context * ctx,
645
736
  llama_seq_id seq_id_src,
646
737
  llama_seq_id seq_id_dst,
647
738
  llama_pos p0,
648
- llama_pos p1);
739
+ llama_pos p1),
740
+ "Use llama_memory_seq_cp() instead");
649
741
 
650
742
  // Removes all tokens that do not belong to the specified sequence
651
- LLAMA_API void llama_kv_self_seq_keep(
743
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
652
744
  struct llama_context * ctx,
653
- llama_seq_id seq_id);
745
+ llama_seq_id seq_id),
746
+ "Use llama_memory_seq_keep() instead");
654
747
 
655
748
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
656
749
  // If the KV cache is RoPEd, the KV data is updated accordingly:
657
750
  // - lazily on next llama_decode()
658
751
  // p0 < 0 : [0, p1]
659
752
  // p1 < 0 : [p0, inf)
660
- LLAMA_API void llama_kv_self_seq_add(
753
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
661
754
  struct llama_context * ctx,
662
755
  llama_seq_id seq_id,
663
756
  llama_pos p0,
664
757
  llama_pos p1,
665
- llama_pos delta);
758
+ llama_pos delta),
759
+ "Use llama_memory_seq_add() instead");
666
760
 
667
761
  // Integer division of the positions by factor of `d > 1`
668
762
  // If the KV cache is RoPEd, the KV data is updated accordingly:
669
763
  // - lazily on next llama_decode()
670
764
  // p0 < 0 : [0, p1]
671
765
  // p1 < 0 : [p0, inf)
672
- LLAMA_API void llama_kv_self_seq_div(
766
+ DEPRECATED(void llama_kv_self_seq_div(
673
767
  struct llama_context * ctx,
674
768
  llama_seq_id seq_id,
675
769
  llama_pos p0,
676
770
  llama_pos p1,
677
- int d);
771
+ int d),
772
+ "Use llama_memory_seq_div() instead");
678
773
 
679
774
  // Returns the smallest position present in the KV cache for the specified sequence
680
775
  // This is typically non-zero only for SWA caches
681
776
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
682
777
  // Return -1 if the sequence is empty
683
- LLAMA_API llama_pos llama_kv_self_seq_pos_min(
778
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
684
779
  struct llama_context * ctx,
685
- llama_seq_id seq_id);
780
+ llama_seq_id seq_id),
781
+ "Use llama_memory_seq_pos_min() instead");
686
782
 
687
783
  // Returns the largest position present in the KV cache for the specified sequence
688
784
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
689
785
  // Return -1 if the sequence is empty
690
- LLAMA_API llama_pos llama_kv_self_seq_pos_max(
786
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
691
787
  struct llama_context * ctx,
692
- llama_seq_id seq_id);
788
+ llama_seq_id seq_id),
789
+ "Use llama_memory_seq_pos_max() instead");
693
790
 
694
791
  // Defragment the KV cache
695
792
  // This will be applied:
696
793
  // - lazily on next llama_decode()
697
- LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
794
+ DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
698
795
  "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
699
796
 
700
797
  // Check if the context supports KV cache shifting
701
- LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
798
+ DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
799
+ "use llama_memory_can_shift() instead");
702
800
 
703
801
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
704
- LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
802
+ DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
705
803
  "simply remove this call, updates are applied lazily on the next llama_decode()");
706
804
 
707
805
  //
@@ -709,7 +807,7 @@ extern "C" {
709
807
  //
710
808
 
711
809
  // Returns the *actual* size in bytes of the state
712
- // (logits, embedding and kv_cache)
810
+ // (logits, embedding and memory)
713
811
  // Only use when saving the state, not when restoring it, otherwise the size may be too small.
714
812
  LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
715
813
  LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +863,12 @@ extern "C" {
765
863
  size_t n_token_count),
766
864
  "use llama_state_save_file instead");
767
865
 
768
- // Get the exact size needed to copy the KV cache of a single sequence
866
+ // Get the exact size needed to copy the state of a single sequence
769
867
  LLAMA_API size_t llama_state_seq_get_size(
770
868
  struct llama_context * ctx,
771
869
  llama_seq_id seq_id);
772
870
 
773
- // Copy the KV cache of a single sequence into the specified buffer
871
+ // Copy the state of a single sequence into the specified buffer
774
872
  LLAMA_API size_t llama_state_seq_get_data(
775
873
  struct llama_context * ctx,
776
874
  uint8_t * dst,
@@ -836,16 +934,16 @@ extern "C" {
836
934
  // For encode-decoder contexts, processes the batch using the encoder.
837
935
  // Can store the encoder output internally for later use by the decoder's cross-attention layers.
838
936
  // 0 - success
839
- // < 0 - error. the KV cache state is restored to the state before this call
937
+ // < 0 - error. the memory state is restored to the state before this call
840
938
  LLAMA_API int32_t llama_encode(
841
939
  struct llama_context * ctx,
842
940
  struct llama_batch batch);
843
941
 
844
942
  // Process a batch of tokens.
845
- // Requires KV cache.
943
+ // Requires the context to have a memory.
846
944
  // For encode-decoder contexts, processes the batch using the decoder.
847
945
  // Positive return values does not mean a fatal error, but rather a warning.
848
- // Upon non-zero return values, the KV cache state is restored to the state before this call
946
+ // Upon non-zero return values, the memory state is restored to the state before this call
849
947
  // 0 - success
850
948
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
851
949
  // 2 - aborted
@@ -866,8 +964,8 @@ extern "C" {
866
964
  // Get the number of threads used for prompt and batch processing (multiple token).
867
965
  LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
868
966
 
869
- // Set whether the model is in embeddings mode or not
870
- // If true, embeddings will be returned but logits will not
967
+ // Set whether the context outputs embeddings or not
968
+ // TODO: rename to avoid confusion with llama_get_embeddings()
871
969
  LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
872
970
 
873
971
  // Set whether to use causal attention or not
@@ -916,7 +1014,7 @@ extern "C" {
916
1014
 
917
1015
  // Get the embeddings for a sequence id
918
1016
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
919
- // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
1017
+ // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
920
1018
  // otherwise: float[n_embd] (1-dimensional)
921
1019
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
922
1020
 
@@ -61,7 +61,10 @@ extern "C" {
61
61
  struct llama_model;
62
62
  struct llama_context;
63
63
  struct llama_sampler;
64
- struct llama_kv_cache;
64
+
65
+ typedef struct llama_memory_i * llama_memory_t;
66
+
67
+ struct llama_kv_cache; // DEPRECATED (use llama_memory instead)
65
68
 
66
69
  typedef int32_t llama_pos;
67
70
  typedef int32_t llama_token;
@@ -240,18 +243,21 @@ extern "C" {
240
243
 
241
244
  typedef bool (*llama_progress_callback)(float progress, void * user_data);
242
245
 
243
- // Input data for llama_decode
246
+ // Input data for llama_encode/llama_decode
244
247
  // A llama_batch object can contain input about one or many sequences
245
248
  // The provided arrays (i.e. token, embd, pos, etc.) must have size of n_tokens
246
249
  //
247
250
  // - token : the token ids of the input (used when embd is NULL)
248
251
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
249
252
  // - pos : the positions of the respective token in the sequence
250
- // (if set to NULL, the token position will be tracked automatically by llama_decode)
253
+ // (if set to NULL, the token position will be tracked automatically by llama_encode/llama_decode)
251
254
  // - seq_id : the sequence to which the respective token belongs
252
255
  // (if set to NULL, the sequence ID will be assumed to be 0)
253
256
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
254
- // (if set to NULL, only the logits for last token will be returned)
257
+ // (if set to NULL:
258
+ // - if embeddings: all tokens are output
259
+ // - if not: only the last token is output
260
+ // )
255
261
  //
256
262
  typedef struct llama_batch {
257
263
  int32_t n_tokens;
@@ -259,8 +265,8 @@ extern "C" {
259
265
  llama_token * token;
260
266
  float * embd;
261
267
  llama_pos * pos;
262
- int32_t * n_seq_id; // TODO: remove, should belong to only 1 sequence
263
- llama_seq_id ** seq_id; // TODO: become llama_seq_id * seq_id;
268
+ int32_t * n_seq_id;
269
+ llama_seq_id ** seq_id;
264
270
  int8_t * logits; // TODO: rename this to "output"
265
271
  } llama_batch;
266
272
 
@@ -493,9 +499,11 @@ extern "C" {
493
499
  DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
494
500
 
495
501
  LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
496
- LLAMA_API struct llama_kv_cache * llama_get_kv_self ( struct llama_context * ctx);
502
+ LLAMA_API llama_memory_t llama_get_memory (const struct llama_context * ctx);
497
503
  LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
498
504
 
505
+ DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead");
506
+
499
507
  LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
500
508
  LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model);
501
509
 
@@ -509,6 +517,13 @@ extern "C" {
509
517
  // Get the model's RoPE frequency scaling factor
510
518
  LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model);
511
519
 
520
+ // Returns the number of classifier outputs (only valid for classifier models)
521
+ // Undefined behavior for non-classifier models
522
+ LLAMA_API uint32_t llama_model_n_cls_out(const struct llama_model * model);
523
+
524
+ // Returns label of classifier output by index (<n_cls_out). Returns nullptr if no label provided
525
+ LLAMA_API const char * llama_model_cls_label(const struct llama_model * model, uint32_t i);
526
+
512
527
  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab);
513
528
 
514
529
  LLAMA_API int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab);
@@ -609,7 +624,81 @@ extern "C" {
609
624
  int32_t il_end);
610
625
 
611
626
  //
612
- // KV cache
627
+ // Memory
628
+ //
629
+
630
+ // Clear the memory contents
631
+ // If data == true, the data buffers will also be cleared together with the metadata
632
+ LLAMA_API void llama_memory_clear(
633
+ llama_memory_t mem,
634
+ bool data);
635
+
636
+ // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
637
+ // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
638
+ // seq_id < 0 : match any sequence
639
+ // p0 < 0 : [0, p1]
640
+ // p1 < 0 : [p0, inf)
641
+ LLAMA_API bool llama_memory_seq_rm(
642
+ llama_memory_t mem,
643
+ llama_seq_id seq_id,
644
+ llama_pos p0,
645
+ llama_pos p1);
646
+
647
+ // Copy all tokens that belong to the specified sequence to another sequence
648
+ // p0 < 0 : [0, p1]
649
+ // p1 < 0 : [p0, inf)
650
+ LLAMA_API void llama_memory_seq_cp(
651
+ llama_memory_t mem,
652
+ llama_seq_id seq_id_src,
653
+ llama_seq_id seq_id_dst,
654
+ llama_pos p0,
655
+ llama_pos p1);
656
+
657
+ // Removes all tokens that do not belong to the specified sequence
658
+ LLAMA_API void llama_memory_seq_keep(
659
+ llama_memory_t mem,
660
+ llama_seq_id seq_id);
661
+
662
+ // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
663
+ // p0 < 0 : [0, p1]
664
+ // p1 < 0 : [p0, inf)
665
+ LLAMA_API void llama_memory_seq_add(
666
+ llama_memory_t mem,
667
+ llama_seq_id seq_id,
668
+ llama_pos p0,
669
+ llama_pos p1,
670
+ llama_pos delta);
671
+
672
+ // Integer division of the positions by factor of `d > 1`
673
+ // p0 < 0 : [0, p1]
674
+ // p1 < 0 : [p0, inf)
675
+ LLAMA_API void llama_memory_seq_div(
676
+ llama_memory_t mem,
677
+ llama_seq_id seq_id,
678
+ llama_pos p0,
679
+ llama_pos p1,
680
+ int d);
681
+
682
+ // Returns the smallest position present in the memory for the specified sequence
683
+ // This is typically non-zero only for SWA caches
684
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
685
+ // Return -1 if the sequence is empty
686
+ LLAMA_API llama_pos llama_memory_seq_pos_min(
687
+ llama_memory_t mem,
688
+ llama_seq_id seq_id);
689
+
690
+ // Returns the largest position present in the memory for the specified sequence
691
+ // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the memory
692
+ // Return -1 if the sequence is empty
693
+ LLAMA_API llama_pos llama_memory_seq_pos_max(
694
+ llama_memory_t mem,
695
+ llama_seq_id seq_id);
696
+
697
+ // Check if the memory supports shifting
698
+ LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);
699
+
700
+ //
701
+ // KV cache for self-attention (TODO: deprecate in favor of llama_memory)
613
702
  //
614
703
 
615
704
  // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -622,86 +711,95 @@ extern "C" {
622
711
  "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)");
623
712
 
624
713
  // Clear the KV cache - both cell info is erased and KV data is zeroed
625
- LLAMA_API void llama_kv_self_clear(
626
- struct llama_context * ctx);
714
+ DEPRECATED(LLAMA_API void llama_kv_self_clear(
715
+ struct llama_context * ctx),
716
+ "Use llama_memory_clear() instead");
627
717
 
628
718
  // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
629
719
  // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
630
720
  // seq_id < 0 : match any sequence
631
721
  // p0 < 0 : [0, p1]
632
722
  // p1 < 0 : [p0, inf)
633
- LLAMA_API bool llama_kv_self_seq_rm(
723
+ DEPRECATED(LLAMA_API bool llama_kv_self_seq_rm(
634
724
  struct llama_context * ctx,
635
725
  llama_seq_id seq_id,
636
726
  llama_pos p0,
637
- llama_pos p1);
727
+ llama_pos p1),
728
+ "Use llama_memory_seq_rm() instead");
638
729
 
639
730
  // Copy all tokens that belong to the specified sequence to another sequence
640
731
  // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
641
732
  // p0 < 0 : [0, p1]
642
733
  // p1 < 0 : [p0, inf)
643
- LLAMA_API void llama_kv_self_seq_cp(
734
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_cp(
644
735
  struct llama_context * ctx,
645
736
  llama_seq_id seq_id_src,
646
737
  llama_seq_id seq_id_dst,
647
738
  llama_pos p0,
648
- llama_pos p1);
739
+ llama_pos p1),
740
+ "Use llama_memory_seq_cp() instead");
649
741
 
650
742
  // Removes all tokens that do not belong to the specified sequence
651
- LLAMA_API void llama_kv_self_seq_keep(
743
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_keep(
652
744
  struct llama_context * ctx,
653
- llama_seq_id seq_id);
745
+ llama_seq_id seq_id),
746
+ "Use llama_memory_seq_keep() instead");
654
747
 
655
748
  // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
656
749
  // If the KV cache is RoPEd, the KV data is updated accordingly:
657
750
  // - lazily on next llama_decode()
658
751
  // p0 < 0 : [0, p1]
659
752
  // p1 < 0 : [p0, inf)
660
- LLAMA_API void llama_kv_self_seq_add(
753
+ DEPRECATED(LLAMA_API void llama_kv_self_seq_add(
661
754
  struct llama_context * ctx,
662
755
  llama_seq_id seq_id,
663
756
  llama_pos p0,
664
757
  llama_pos p1,
665
- llama_pos delta);
758
+ llama_pos delta),
759
+ "Use llama_memory_seq_add() instead");
666
760
 
667
761
  // Integer division of the positions by factor of `d > 1`
668
762
  // If the KV cache is RoPEd, the KV data is updated accordingly:
669
763
  // - lazily on next llama_decode()
670
764
  // p0 < 0 : [0, p1]
671
765
  // p1 < 0 : [p0, inf)
672
- LLAMA_API void llama_kv_self_seq_div(
766
+ DEPRECATED(void llama_kv_self_seq_div(
673
767
  struct llama_context * ctx,
674
768
  llama_seq_id seq_id,
675
769
  llama_pos p0,
676
770
  llama_pos p1,
677
- int d);
771
+ int d),
772
+ "Use llama_memory_seq_div() instead");
678
773
 
679
774
  // Returns the smallest position present in the KV cache for the specified sequence
680
775
  // This is typically non-zero only for SWA caches
681
776
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
682
777
  // Return -1 if the sequence is empty
683
- LLAMA_API llama_pos llama_kv_self_seq_pos_min(
778
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_min(
684
779
  struct llama_context * ctx,
685
- llama_seq_id seq_id);
780
+ llama_seq_id seq_id),
781
+ "Use llama_memory_seq_pos_min() instead");
686
782
 
687
783
  // Returns the largest position present in the KV cache for the specified sequence
688
784
  // Note that all positions in the range [pos_min, pos_max] are guaranteed to be present in the KV cache
689
785
  // Return -1 if the sequence is empty
690
- LLAMA_API llama_pos llama_kv_self_seq_pos_max(
786
+ DEPRECATED(LLAMA_API llama_pos llama_kv_self_seq_pos_max(
691
787
  struct llama_context * ctx,
692
- llama_seq_id seq_id);
788
+ llama_seq_id seq_id),
789
+ "Use llama_memory_seq_pos_max() instead");
693
790
 
694
791
  // Defragment the KV cache
695
792
  // This will be applied:
696
793
  // - lazily on next llama_decode()
697
- LLAMA_API DEPRECATED(void llama_kv_self_defrag(struct llama_context * ctx),
794
+ DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx),
698
795
  "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'");
699
796
 
700
797
  // Check if the context supports KV cache shifting
701
- LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
798
+ DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx),
799
+ "use llama_memory_can_shift() instead");
702
800
 
703
801
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
704
- LLAMA_API DEPRECATED(void llama_kv_self_update(struct llama_context * ctx),
802
+ DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx),
705
803
  "simply remove this call, updates are applied lazily on the next llama_decode()");
706
804
 
707
805
  //
@@ -709,7 +807,7 @@ extern "C" {
709
807
  //
710
808
 
711
809
  // Returns the *actual* size in bytes of the state
712
- // (logits, embedding and kv_cache)
810
+ // (logits, embedding and memory)
713
811
  // Only use when saving the state, not when restoring it, otherwise the size may be too small.
714
812
  LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
715
813
  LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -765,12 +863,12 @@ extern "C" {
765
863
  size_t n_token_count),
766
864
  "use llama_state_save_file instead");
767
865
 
768
- // Get the exact size needed to copy the KV cache of a single sequence
866
+ // Get the exact size needed to copy the state of a single sequence
769
867
  LLAMA_API size_t llama_state_seq_get_size(
770
868
  struct llama_context * ctx,
771
869
  llama_seq_id seq_id);
772
870
 
773
- // Copy the KV cache of a single sequence into the specified buffer
871
+ // Copy the state of a single sequence into the specified buffer
774
872
  LLAMA_API size_t llama_state_seq_get_data(
775
873
  struct llama_context * ctx,
776
874
  uint8_t * dst,
@@ -836,16 +934,16 @@ extern "C" {
836
934
  // For encode-decoder contexts, processes the batch using the encoder.
837
935
  // Can store the encoder output internally for later use by the decoder's cross-attention layers.
838
936
  // 0 - success
839
- // < 0 - error. the KV cache state is restored to the state before this call
937
+ // < 0 - error. the memory state is restored to the state before this call
840
938
  LLAMA_API int32_t llama_encode(
841
939
  struct llama_context * ctx,
842
940
  struct llama_batch batch);
843
941
 
844
942
  // Process a batch of tokens.
845
- // Requires KV cache.
943
+ // Requires the context to have a memory.
846
944
  // For encode-decoder contexts, processes the batch using the decoder.
847
945
  // Positive return values does not mean a fatal error, but rather a warning.
848
- // Upon non-zero return values, the KV cache state is restored to the state before this call
946
+ // Upon non-zero return values, the memory state is restored to the state before this call
849
947
  // 0 - success
850
948
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
851
949
  // 2 - aborted
@@ -866,8 +964,8 @@ extern "C" {
866
964
  // Get the number of threads used for prompt and batch processing (multiple token).
867
965
  LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
868
966
 
869
- // Set whether the model is in embeddings mode or not
870
- // If true, embeddings will be returned but logits will not
967
+ // Set whether the context outputs embeddings or not
968
+ // TODO: rename to avoid confusion with llama_get_embeddings()
871
969
  LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
872
970
 
873
971
  // Set whether to use causal attention or not
@@ -916,7 +1014,7 @@ extern "C" {
916
1014
 
917
1015
  // Get the embeddings for a sequence id
918
1016
  // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
919
- // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
1017
+ // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[n_cls_out] with the rank(s) of the sequence
920
1018
  // otherwise: float[n_embd] (1-dimensional)
921
1019
  LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
922
1020