cui-llama.rn 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195)
  1. package/README.md +35 -7
  2. package/android/src/main/CMakeLists.txt +16 -11
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -1
  4. package/android/src/main/jni.cpp +20 -4
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  13. package/cpp/LICENSE +21 -0
  14. package/cpp/chat.cpp +1 -1
  15. package/cpp/common.cpp +17 -2
  16. package/cpp/common.h +7 -3
  17. package/cpp/ggml-alloc.c +4 -1
  18. package/cpp/ggml-cpp.h +1 -1
  19. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  20. package/cpp/ggml-cpu/amx/amx.h +8 -0
  21. package/cpp/ggml-cpu/amx/common.h +91 -0
  22. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  23. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  24. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/binary-ops.h +1 -1
  25. package/cpp/ggml-cpu/common.h +72 -0
  26. package/cpp/{ggml-cpu-aarch64.cpp → ggml-cpu/ggml-cpu-aarch64.cpp} +809 -101
  27. package/cpp/{ggml-cpu.c → ggml-cpu/ggml-cpu.c} +109 -42
  28. package/cpp/{ggml-cpu.cpp → ggml-cpu/ggml-cpu.cpp} +3 -0
  29. package/cpp/{ops.cpp → ggml-cpu/ops.cpp} +246 -160
  30. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/ops.h +2 -20
  31. package/cpp/{sgemm.cpp → ggml-cpu/sgemm.cpp} +501 -0
  32. package/{ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers → cpp/ggml-cpu}/simd-mappings.h +7 -3
  33. package/{ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers → cpp/ggml-cpu}/unary-ops.h +1 -1
  34. package/cpp/ggml-cpu.h +5 -0
  35. package/cpp/ggml-impl.h +16 -9
  36. package/cpp/ggml-llama-sim.metallib +0 -0
  37. package/cpp/ggml-llama.metallib +0 -0
  38. package/cpp/ggml-metal.m +492 -47
  39. package/cpp/ggml.c +134 -244
  40. package/cpp/ggml.h +61 -94
  41. package/cpp/json-schema-to-grammar.cpp +3 -0
  42. package/cpp/llama-arch.cpp +46 -17
  43. package/cpp/llama-arch.h +9 -0
  44. package/cpp/llama-batch.cpp +5 -1
  45. package/cpp/llama-batch.h +2 -1
  46. package/cpp/llama-chat.cpp +31 -10
  47. package/cpp/llama-chat.h +3 -2
  48. package/cpp/llama-context.cpp +104 -489
  49. package/cpp/llama-context.h +14 -30
  50. package/cpp/llama-graph.cpp +69 -62
  51. package/cpp/llama-graph.h +21 -18
  52. package/cpp/llama-hparams.h +5 -0
  53. package/cpp/llama-kv-cache.cpp +1497 -391
  54. package/cpp/llama-kv-cache.h +272 -80
  55. package/cpp/llama-memory.h +11 -1
  56. package/cpp/llama-model.cpp +502 -176
  57. package/cpp/llama-model.h +13 -3
  58. package/cpp/llama-sampling.cpp +2 -1
  59. package/cpp/llama-vocab.cpp +8 -1
  60. package/cpp/llama.h +14 -11
  61. package/cpp/rn-llama.cpp +20 -172
  62. package/cpp/rn-llama.h +1 -5
  63. package/ios/CMakeLists.txt +13 -10
  64. package/ios/RNLlama.h +6 -0
  65. package/ios/RNLlama.mm +5 -0
  66. package/ios/RNLlamaContext.mm +26 -28
  67. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +7 -3
  68. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  69. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  70. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  71. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +61 -94
  72. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  73. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  74. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  75. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  76. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  77. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  78. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  79. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  80. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  81. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +14 -11
  82. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  83. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  84. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  85. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  86. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  87. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  88. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  89. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  90. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  91. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  92. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  93. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  94. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  95. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  96. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  97. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  98. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  99. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  100. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  101. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  102. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  103. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +7 -3
  104. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpp.h +1 -1
  105. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +5 -0
  106. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +16 -9
  107. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +61 -94
  108. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +9 -0
  109. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +2 -1
  110. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +3 -2
  111. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +14 -30
  112. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +21 -18
  113. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +5 -0
  114. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  115. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +11 -1
  116. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +13 -3
  117. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +14 -11
  118. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +1 -5
  119. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  120. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  121. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +7 -3
  122. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpp.h +1 -1
  123. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +5 -0
  124. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +16 -9
  125. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +61 -94
  126. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +9 -0
  127. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +2 -1
  128. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +3 -2
  129. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +14 -30
  130. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +21 -18
  131. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +5 -0
  132. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +272 -80
  133. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +11 -1
  134. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +13 -3
  135. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +14 -11
  136. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +1 -5
  137. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  138. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  139. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  140. package/lib/module/NativeRNLlama.js.map +1 -1
  141. package/lib/typescript/NativeRNLlama.d.ts +4 -0
  142. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  143. package/package.json +1 -1
  144. package/src/NativeRNLlama.ts +5 -0
  145. package/cpp/binary-ops.h +0 -16
  146. package/cpp/ops.h +0 -128
  147. package/cpp/simd-mappings.h +0 -888
  148. package/cpp/unary-ops.h +0 -28
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ops.h +0 -128
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  157. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/vec.h +0 -802
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  165. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/binary-ops.h +0 -16
  166. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  167. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  168. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  169. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  170. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ops.h +0 -128
  171. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sgemm.h +0 -14
  172. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/simd-mappings.h +0 -888
  173. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unary-ops.h +0 -28
  174. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/vec.h +0 -802
  175. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/binary-ops.h +0 -16
  176. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-aarch64.h +0 -8
  177. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-impl.h +0 -512
  178. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-quants.h +0 -63
  179. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu-traits.h +0 -38
  180. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ops.h +0 -128
  181. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/sgemm.h +0 -14
  182. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/simd-mappings.h +0 -888
  183. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/unary-ops.h +0 -28
  184. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/vec.h +0 -802
  185. /package/cpp/{binary-ops.cpp → ggml-cpu/binary-ops.cpp} +0 -0
  186. /package/cpp/{ggml-cpu-aarch64.h → ggml-cpu/ggml-cpu-aarch64.h} +0 -0
  187. /package/cpp/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -0
  188. /package/cpp/{ggml-cpu-quants.c → ggml-cpu/ggml-cpu-quants.c} +0 -0
  189. /package/cpp/{ggml-cpu-quants.h → ggml-cpu/ggml-cpu-quants.h} +0 -0
  190. /package/cpp/{ggml-cpu-traits.cpp → ggml-cpu/ggml-cpu-traits.cpp} +0 -0
  191. /package/cpp/{ggml-cpu-traits.h → ggml-cpu/ggml-cpu-traits.h} +0 -0
  192. /package/cpp/{sgemm.h → ggml-cpu/sgemm.h} +0 -0
  193. /package/cpp/{unary-ops.cpp → ggml-cpu/unary-ops.cpp} +0 -0
  194. /package/cpp/{vec.cpp → ggml-cpu/vec.cpp} +0 -0
  195. /package/cpp/{vec.h → ggml-cpu/vec.h} +0 -0
llama-kv-cache.h

@@ -2,32 +2,72 @@
 
  #include "llama.h"
  #include "llama-io.h"
+ #include "llama-graph.h"
  #include "llama-memory.h"
 
  #include "ggml-cpp.h"
 
- #include <functional>
  #include <set>
  #include <vector>
 
  struct llama_cparams;
  struct llama_hparams;
  struct llama_ubatch;
+ struct llama_sbatch;
+ struct llama_model;
+ struct llama_context;
 
  struct llama_kv_cache : public llama_memory_i {
- using llama_memory_i::llama_memory_i;
+ virtual ~llama_kv_cache() = default;
 
- virtual void restore() = 0; // call if batch processing fails - restores the cache state
- virtual void commit() = 0; // call after successful batch processing - clears any pending state
+ // call if batch processing fails - restores the cache state
+ virtual void restore() = 0;
 
- virtual int32_t get_n_tokens() const = 0;
- virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+ // call after successful batch processing - clears any pending state
+ virtual void commit() = 0;
 
- virtual bool get_can_shift() const = 0;
+ // process any pending defrag/shift/etc. operations
+ // optionally call once before processing a new batch
+ virtual bool update(llama_context & lctx) = 0;
+
+ // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+ virtual void defrag_sched(float thold) = 0;
+
+ // simulate full cache, used for allocating worst-case compute buffers
+ virtual void set_full() = 0;
+
+ //
+ // batch processing
+ //
+
+ virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
+
+ // different KV caches require different batch splitting strategies
+ virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
+
+ // find an empty slot of size "n_tokens" in the cache
+ virtual bool find_slot(const llama_ubatch & batch) = 0;
+
+ // getters
+ virtual int32_t get_n_tokens() const = 0;
+ virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+ virtual llama_pos get_pos_max() const = 0;
+ virtual bool get_can_shift() const = 0;
 
  bool get_can_edit() const override { return get_can_shift(); }
+
+ //
+ // state write/read
+ //
+
+ virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+ virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
  };
 
+ //
+ // llama_kv_cache_guard
+ //
+
  struct llama_kv_cache_guard {
  llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
 
@@ -43,65 +83,50 @@ private:
  llama_kv_cache * kv;
  };
 
- struct llama_kv_cell {
- llama_pos pos = -1;
- llama_pos delta = 0;
- int32_t src = -1; // used by recurrent state models to copy states
- int32_t tail = -1;
+ //
+ // llama_kv_cache_unified
+ //
 
- std::set<llama_seq_id> seq_id;
+ // TODO: add notion of max sequences
+ class llama_kv_cache_unified : public llama_kv_cache {
+ public:
+ struct kv_cell {
+ llama_pos pos = -1;
+ llama_pos delta = 0;
 
- bool has_seq_id(const llama_seq_id & id) const {
- return seq_id.find(id) != seq_id.end();
- }
+ std::set<llama_seq_id> seq_id;
 
- bool is_empty() const {
- return seq_id.empty();
- }
+ bool has_seq_id(const llama_seq_id & id) const {
+ return seq_id.find(id) != seq_id.end();
+ }
 
- bool is_same_seq(const llama_kv_cell & other) const {
- return seq_id == other.seq_id;
- }
- };
+ bool is_empty() const {
+ return seq_id.empty();
+ }
 
- // ring-buffer of cached KV data
- // TODO: pimpl
- // TODO: add notion of max sequences
- class llama_kv_cache_unified : public llama_kv_cache {
- public:
- // can be used to query data from the model if needed
- struct callbacks {
- std::function<lm_ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
+ bool is_same_seq(const kv_cell & other) const {
+ return seq_id == other.seq_id;
+ }
  };
 
- llama_kv_cache_unified(
- const llama_hparams & hparams,
- callbacks cbs);
-
- virtual ~llama_kv_cache_unified() = default;
+ static uint32_t get_padding(const llama_cparams & cparams);
 
- // TODO: become constructor
- bool init(
- const llama_model & model, // TODO: do not reference the model
- const llama_cparams & cparams,
+ llama_kv_cache_unified(
+ const llama_model & model,
  lm_ggml_type type_k,
  lm_ggml_type type_v,
+ bool v_trans,
+ bool offload,
  uint32_t kv_size,
- bool offload);
-
- int32_t get_n_tokens() const override;
- int32_t get_used_cells() const override;
+ uint32_t padding);
 
- size_t total_size() const;
+ ~llama_kv_cache_unified() = default;
 
- // TODO: better data structures to reduce the cost of this operation
- llama_pos pos_max() const;
+ //
+ // llama_memory_i
+ //
 
  void clear() override;
- void defrag() override;
-
- virtual void restore() override;
- virtual void commit() override;
 
  bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
  void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -111,25 +136,76 @@ public:
 
  llama_pos seq_pos_max(llama_seq_id seq_id) const override;
 
- bool get_can_shift() const override;
+ //
+ // llama_kv_cache
+ //
+
+ void restore() override;
+ void commit() override;
+
+ bool update(llama_context & ctx) override;
+
+ void defrag_sched(float thold) override;
+
+ void set_full() override;
+
+ llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+ llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
 
- // find an empty slot of size "n_tokens" in the cache
  // updates the cache head
  // Note: On success, it's important that cache.head points
  // to the first cell of the slot.
- bool find_slot(const llama_ubatch & batch);
+ bool find_slot(const llama_ubatch & batch) override;
 
- // TODO: maybe not needed
- uint32_t get_padding(const llama_cparams & cparams) const;
+ int32_t get_n_tokens() const override;
+ int32_t get_used_cells() const override;
 
- // find how many cells are currently in use
- uint32_t cell_max() const;
+ // TODO: better data structures to reduce the cost of this operation
+ llama_pos get_pos_max() const override;
 
- size_t size_k_bytes() const;
- size_t size_v_bytes() const;
+ bool get_can_shift() const override;
 
- // defrag
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+ // Note: The value of head isn't only used to optimize searching
+ // for a free KV slot. llama_decode_impl also uses it, so it
+ // cannot be freely changed after a slot has been allocated.
+ uint32_t head = 0;
+ uint32_t size = 0;
+ uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+ // computed before each graph build
+ uint32_t n = 0;
+
+ std::vector<kv_cell> cells;
+
+ std::vector<lm_ggml_tensor *> k_l; // per layer
+ std::vector<lm_ggml_tensor *> v_l;
+
+ private:
+ const llama_model & model;
+ const llama_hparams & hparams;
+
+ bool has_shift = false;
+ bool do_defrag = false;
+
+ bool v_trans = true; // the value tensor is transposed
+ bool can_shift = false;
+
+ // required padding
+ uint32_t padding = 1;
+
+ lm_ggml_type type_k = LM_GGML_TYPE_F16;
+ lm_ggml_type type_v = LM_GGML_TYPE_F16;
+
+ std::vector<lm_ggml_context_ptr> ctxs;
+ std::vector<lm_ggml_backend_buffer_ptr> bufs;
 
+ // defrag
  struct {
  std::vector<uint32_t> ids;
  } defrag_info;
@@ -138,7 +214,6 @@ public:
  bool defrag_prepare(int32_t n_max_nodes);
 
  // commit/restore cache
-
  struct slot_range {
  uint32_t c0 = 0; // note: these are cell indices, not sequence positions
  uint32_t c1 = 0;
@@ -149,25 +224,124 @@ public:
  std::vector<slot_range> ranges;
  } pending;
 
- // state write/load
+ // find how many cells are currently in use
+ uint32_t cell_max() const;
 
- void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
- void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
+ size_t total_size() const;
 
- // members
+ size_t size_k_bytes() const;
+ size_t size_v_bytes() const;
 
- const llama_hparams & hparams;
+ lm_ggml_tensor * build_rope_shift(
+ const llama_cparams & cparams,
+ lm_ggml_context * ctx,
+ lm_ggml_tensor * cur,
+ lm_ggml_tensor * shift,
+ lm_ggml_tensor * factors,
+ float freq_base,
+ float freq_scale) const;
+
+ llm_graph_result_ptr build_graph_shift(
+ const llama_cparams & cparams,
+ lm_ggml_context * ctx,
+ lm_ggml_cgraph * gf) const;
+
+ llm_graph_result_ptr build_graph_defrag(
+ const llama_cparams & cparams,
+ lm_ggml_context * ctx,
+ lm_ggml_cgraph * gf) const;
 
- callbacks cbs;
+ void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+ void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
 
- bool has_shift = false;
- bool do_defrag = false;
+ bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+ bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+ };
 
- // TODO: remove this and implement llama_kv_cache_recurrent instead
- bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
+ //
+ // llama_kv_cache_recurrent
+ //
 
- bool v_trans = true; // the value tensor is transposed
- bool can_shift = false;
+ class llama_kv_cache_recurrent : public llama_kv_cache {
+ public:
+ struct kv_cell {
+ llama_pos pos = -1;
+ int32_t src = -1; // used to copy states
+ int32_t tail = -1;
+
+ std::set<llama_seq_id> seq_id;
+
+ bool has_seq_id(const llama_seq_id & id) const {
+ return seq_id.find(id) != seq_id.end();
+ }
+
+ bool is_empty() const {
+ return seq_id.empty();
+ }
+
+ bool is_same_seq(const kv_cell & other) const {
+ return seq_id == other.seq_id;
+ }
+ };
+
+ llama_kv_cache_recurrent(
+ const llama_model & model,
+ lm_ggml_type type_k,
+ lm_ggml_type type_v,
+ bool offload,
+ uint32_t kv_size);
+
+ ~llama_kv_cache_recurrent() = default;
+
+ //
+ // llama_memory_i
+ //
+
+ void clear() override;
+
+ bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+ void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+ void seq_keep(llama_seq_id seq_id) override;
+ void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+ void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+ llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+ //
+ // llama_kv_cache
+ //
+
+ void restore() override;
+ void commit() override;
+
+ bool update(llama_context & lctx) override;
+
+ void defrag_sched(float thold) override;
+
+ void set_full() override;
+
+ llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+ llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+ bool find_slot(const llama_ubatch & batch) override;
+
+ int32_t get_n_tokens() const override;
+ int32_t get_used_cells() const override;
+
+ // TODO: better data structures to reduce the cost of this operation
+ llama_pos get_pos_max() const override;
+
+ bool get_can_shift() const override;
+
+ // TODO: temporary methods - they are not really const as they do const_cast<>, fix this
+ int32_t s_copy(int i) const;
+ float s_mask(int i) const;
+
+ // state write/load
+
+ void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+ void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
 
  // Note: The value of head isn't only used to optimize searching
  // for a free KV slot. llama_decode_impl also uses it, so it
@@ -179,18 +353,41 @@ public:
  // computed before each graph build
  uint32_t n = 0;
 
- std::vector<llama_kv_cell> cells;
+ std::vector<kv_cell> cells;
 
  std::vector<lm_ggml_tensor *> k_l; // per layer
  std::vector<lm_ggml_tensor *> v_l;
 
  private:
+ //const llama_model & model;
+ const llama_hparams & hparams;
+
+ // commit/restore cache
+ // TODO: rework for recurrent cache
+ struct slot_range {
+ uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+ uint32_t c1 = 0;
+ };
+
+ // pending cell updates that are not yet committed
+ struct {
+ std::vector<slot_range> ranges;
+ } pending;
+
  lm_ggml_type type_k = LM_GGML_TYPE_F16;
  lm_ggml_type type_v = LM_GGML_TYPE_F16;
 
  std::vector<lm_ggml_context_ptr> ctxs;
  std::vector<lm_ggml_backend_buffer_ptr> bufs;
 
+ // find how many cells are currently in use
+ uint32_t cell_max() const;
+
+ size_t total_size() const;
+
+ size_t size_k_bytes() const;
+ size_t size_v_bytes() const;
+
  void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
  void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
 
@@ -198,11 +395,6 @@ private:
  bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
  };
 
- // TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
- //class llama_kv_cache_recurrent : public llama_kv_cache_unified {
- //public:
- // using llama_kv_cache_unified::llama_kv_cache_unified;
- //};
 
  //
  // kv cache view
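Note: the restore()/commit() pair declared above defines the rollback contract for batch processing. A minimal sketch of a caller under that contract (illustrative only; the function name process_ubatch is hypothetical and the real decode loop in llama-context.cpp is not part of this excerpt):

    // hedged sketch: kv is assumed to refer to a concrete cache (e.g. llama_kv_cache_unified)
    static bool process_ubatch(llama_kv_cache & kv, const llama_ubatch & ubatch) {
        if (!kv.find_slot(ubatch)) {
            kv.restore(); // batch processing failed - roll back any pending cache state
            return false;
        }
        // ... build and evaluate the graph for this ubatch ...
        kv.commit();      // success - clear the pending state
        return true;
    }

llama_kv_cache_guard appears intended to wrap the same contract in RAII form around the pointer passed to its constructor; its member functions fall outside the hunks shown here.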
llama-memory.h

@@ -2,12 +2,22 @@
 
  #include "llama.h"
 
+ struct llama_memory_params {
+ // kv cache
+ lm_ggml_type type_k;
+ lm_ggml_type type_v;
+
+ // parameters for other types of memory
+ // ...
+ };
+
  // general concept of LLM memory
  // the KV cache is a type of LLM memory, but there can be other types
  class llama_memory_i {
  public:
+ virtual ~llama_memory_i() = default;
+
  virtual void clear() = 0;
- virtual void defrag() = 0;
 
  virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
  virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
llama-model.h

@@ -36,14 +36,17 @@ enum llm_type {
  LLM_TYPE_335M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
+ LLM_TYPE_475M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
  LLM_TYPE_0_5B,
+ LLM_TYPE_0_6B,
  LLM_TYPE_1B,
  LLM_TYPE_1_3B,
  LLM_TYPE_1_4B,
  LLM_TYPE_1_5B,
  LLM_TYPE_1_6B,
+ LLM_TYPE_1_7B,
  LLM_TYPE_1_8B,
  LLM_TYPE_2B,
  LLM_TYPE_2_8B,
@@ -62,6 +65,7 @@ enum llm_type {
  LLM_TYPE_15B,
  LLM_TYPE_16B,
  LLM_TYPE_20B,
+ LLM_TYPE_27B,
  LLM_TYPE_30B,
  LLM_TYPE_32B,
  LLM_TYPE_34B,
@@ -70,6 +74,7 @@ enum llm_type {
  LLM_TYPE_65B,
  LLM_TYPE_70B,
  LLM_TYPE_236B,
+ LLM_TYPE_290B,
  LLM_TYPE_314B,
  LLM_TYPE_671B,
  LLM_TYPE_SMALL,
@@ -84,10 +89,10 @@ enum llm_type {
  LLM_TYPE_16x3_8B,
  LLM_TYPE_10B_128x3_66B,
  LLM_TYPE_57B_A14B,
- LLM_TYPE_27B,
- LLM_TYPE_290B,
  LLM_TYPE_17B_16E, // llama4 Scout
  LLM_TYPE_17B_128E, // llama4 Maverick
+ LLM_TYPE_30B_A3B,
+ LLM_TYPE_235B_A22B,
  };
 
  struct llama_layer_posnet {
@@ -171,6 +176,8 @@ struct llama_layer {
  struct lm_ggml_tensor * wq_b = nullptr;
  struct lm_ggml_tensor * wkv_a_mqa = nullptr;
  struct lm_ggml_tensor * wkv_b = nullptr;
+ struct lm_ggml_tensor * wk_b = nullptr;
+ struct lm_ggml_tensor * wv_b = nullptr;
  struct lm_ggml_tensor * wq_cross = nullptr;
  struct lm_ggml_tensor * wk_cross = nullptr;
  struct lm_ggml_tensor * wv_cross = nullptr;
@@ -388,8 +395,11 @@ struct llama_model {
 
  const struct lm_ggml_tensor * get_tensor(const char * name) const;
 
+ lm_ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+ // note: can mutate `cparams`
  // TODO: move this to new llm_arch_model_i interface
- llama_memory_i * create_memory() const; // TODO: params
+ llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
  // TODO: move this to new llm_arch_model_i interface
  llm_graph_result_ptr build_graph(
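The new create_memory() signature pairs with the llama_memory_params struct added in llama-memory.h above. A hedged sketch of a call site (assumed for illustration; the actual caller is not shown in this diff, and model/cparams are presumed to be initialized elsewhere):

    // fill the memory parameters declared in llama-memory.h
    llama_memory_params mparams = {
        /*.type_k =*/ LM_GGML_TYPE_F16,
        /*.type_v =*/ LM_GGML_TYPE_F16,
    };

    // note: per the header comment, create_memory() may mutate cparams
    llama_memory_i * mem = model.create_memory(mparams, cparams);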
llama.h

@@ -112,6 +112,7 @@ extern "C" {
  LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
  LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
  LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
  };
 
  enum llama_rope_type {
@@ -368,17 +369,18 @@ extern "C" {
 
  // model quantization parameters
  typedef struct llama_model_quantize_params {
- int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
- enum llama_ftype ftype; // quantize to this llama_ftype
- enum lm_ggml_type output_tensor_type; // output tensor type
- enum lm_ggml_type token_embedding_type; // token embeddings tensor type
- bool allow_requantize; // allow quantizing non-f32/f16 tensors
- bool quantize_output_tensor; // quantize output.weight
- bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
- bool pure; // quantize all tensors to the default type
- bool keep_split; // quantize to the same number of shards
- void * imatrix; // pointer to importance matrix data
- void * kv_overrides; // pointer to vector containing overrides
+ int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ enum llama_ftype ftype; // quantize to this llama_ftype
+ enum lm_ggml_type output_tensor_type; // output tensor type
+ enum lm_ggml_type token_embedding_type; // token embeddings tensor type
+ bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ bool quantize_output_tensor; // quantize output.weight
+ bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+ bool pure; // quantize all tensors to the default type
+ bool keep_split; // quantize to the same number of shards
+ void * imatrix; // pointer to importance matrix data
+ void * kv_overrides; // pointer to vector containing overrides
+ void * tensor_types; // pointer to vector containing tensor types
  } llama_model_quantize_params;
 
  typedef struct llama_logit_bias {
@@ -1231,6 +1233,7 @@ extern "C" {
  "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ /// Setting k <= 0 makes this a noop
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 
  /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
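The added doc comment makes the top-K edge case explicit; a short sketch using the declaration above:

    llama_sampler * s_disabled = llama_sampler_init_top_k(0);  // k <= 0: the sampler is a noop
    llama_sampler * s_top40    = llama_sampler_init_top_k(40); // keep only the 40 most probable tokens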
rn-llama.h

@@ -16,7 +16,6 @@
 
  namespace rnllama {
 
-
  std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
 
  std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
@@ -69,6 +68,7 @@ struct llama_rn_context {
 
  int n_ctx;
 
+ bool context_full = false;
  bool truncated = false;
  bool stopped_eos = false;
  bool stopped_word = false;
@@ -107,10 +107,6 @@ struct llama_rn_context {
  int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
  void removeLoraAdapters();
  std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
- std::vector<int> longest_common_subseq(const std::vector<int> x, const std::vector<int> y);
- bool arr_start_with(const std::vector<int> targetArray, const std::vector<int> searchSeq);
- int arr_find_index_of(const std::vector<int> targetArray, const std::vector<int> searchSeq);
- void purge_missing_tokens(llama_context * ctx, std::vector<int> &current_context_tokens, std::vector<int> &new_context_tokens, const int genamt, const int nctx);
  };\
 
  // Logging macros
common.h

@@ -355,8 +355,10 @@ struct common_params {
 
  common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
- // multimodal models (see examples/llava)
+ // multimodal models (see tools/llava)
  struct common_params_model mmproj;
+ bool mmproj_use_gpu = true; // use GPU for multimodal model
+ bool no_mmproj = false; // explicitly disable multimodal model
  std::vector<std::string> image; // path to image file(s)
 
  // embedding
@@ -427,8 +429,8 @@ struct common_params {
  int n_pca_batch = 100;
  int n_pca_iterations = 1000;
  dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
- std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
- std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
+ std::string cvector_positive_file = "tools/cvector-generator/positive.txt";
+ std::string cvector_negative_file = "tools/cvector-generator/negative.txt";
 
  bool spm_infill = false; // suffix/prefix/middle pattern for infill
 
@@ -558,6 +560,8 @@ struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const
  // clear LoRA adapters from context, then apply new list of adapters
  void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
+ std::string get_model_endpoint();
+
 
  //
  // Batch utils
ggml-cpp.h

@@ -24,7 +24,7 @@ typedef std::unique_ptr<lm_gguf_context, lm_gguf_context_deleter> lm_gguf_contex
 
  struct lm_ggml_gallocr_deleter { void operator()(lm_ggml_gallocr_t galloc) { lm_ggml_gallocr_free(galloc); } };
 
- typedef std::unique_ptr<lm_ggml_gallocr_t, lm_ggml_gallocr_deleter> lm_ggml_gallocr_ptr;
+ typedef std::unique_ptr<lm_ggml_gallocr, lm_ggml_gallocr_deleter> lm_ggml_gallocr_ptr;
 
  // ggml-backend