@fugood/llama.node 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. package/CMakeLists.txt +2 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +1 -1
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +8 -8
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +8 -9
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +4 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +43 -9
  25. package/src/llama.cpp/.github/workflows/docker.yml +3 -0
  26. package/src/llama.cpp/CMakeLists.txt +7 -4
  27. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  28. package/src/llama.cpp/common/CMakeLists.txt +0 -2
  29. package/src/llama.cpp/common/arg.cpp +642 -607
  30. package/src/llama.cpp/common/arg.h +22 -22
  31. package/src/llama.cpp/common/common.cpp +79 -281
  32. package/src/llama.cpp/common/common.h +130 -100
  33. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  34. package/src/llama.cpp/common/log.cpp +50 -50
  35. package/src/llama.cpp/common/log.h +18 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  37. package/src/llama.cpp/common/ngram-cache.h +19 -19
  38. package/src/llama.cpp/common/sampling.cpp +116 -108
  39. package/src/llama.cpp/common/sampling.h +20 -20
  40. package/src/llama.cpp/docs/build.md +37 -17
  41. package/src/llama.cpp/examples/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +14 -14
  43. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  46. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  47. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  48. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  49. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  50. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  51. package/src/llama.cpp/examples/imatrix/imatrix.cpp +20 -11
  52. package/src/llama.cpp/examples/infill/infill.cpp +40 -86
  53. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -151
  54. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  55. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  56. package/src/llama.cpp/examples/llava/clip.cpp +1 -0
  57. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  58. package/src/llama.cpp/examples/llava/llava.cpp +37 -3
  59. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  60. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  61. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  62. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  63. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +14 -14
  64. package/src/llama.cpp/examples/lookup/lookup.cpp +29 -29
  65. package/src/llama.cpp/examples/main/main.cpp +64 -109
  66. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  67. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  68. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  69. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  70. package/src/llama.cpp/examples/retrieval/retrieval.cpp +13 -13
  71. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  72. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +34 -17
  73. package/src/llama.cpp/examples/server/CMakeLists.txt +4 -13
  74. package/src/llama.cpp/examples/server/server.cpp +553 -691
  75. package/src/llama.cpp/examples/server/utils.hpp +312 -25
  76. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  77. package/src/llama.cpp/examples/simple/simple.cpp +128 -96
  78. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  79. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  80. package/src/llama.cpp/examples/speculative/speculative.cpp +54 -51
  81. package/src/llama.cpp/examples/tokenize/tokenize.cpp +2 -2
  82. package/src/llama.cpp/ggml/CMakeLists.txt +15 -9
  83. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  84. package/src/llama.cpp/ggml/include/ggml-backend.h +46 -33
  85. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  86. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  87. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  88. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  89. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  90. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  91. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  92. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  93. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  94. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  95. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  96. package/src/llama.cpp/ggml/include/ggml.h +53 -393
  97. package/src/llama.cpp/ggml/src/CMakeLists.txt +66 -1149
  98. package/src/llama.cpp/ggml/src/ggml-aarch64.c +46 -3126
  99. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  100. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -27
  101. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  102. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  103. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  104. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  105. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  106. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +6 -25
  107. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  108. package/src/llama.cpp/ggml/src/ggml-backend.cpp +303 -864
  109. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  110. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +213 -65
  111. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  112. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +255 -149
  113. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  114. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  115. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  116. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +0 -243
  117. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  118. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  119. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  120. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  121. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +667 -1
  122. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  123. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  124. package/src/llama.cpp/ggml/src/ggml-impl.h +366 -16
  125. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  126. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +238 -72
  127. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  128. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  129. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  130. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  131. package/src/llama.cpp/ggml/src/ggml-quants.c +187 -10692
  132. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  133. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  134. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +475 -300
  135. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  136. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  137. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +40 -0
  138. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +258 -0
  139. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  140. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +2 -22
  141. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  142. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  143. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3584 -4142
  144. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +69 -67
  145. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +3 -3
  146. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  147. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  148. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  149. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  150. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  151. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  152. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  153. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  154. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  155. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +555 -623
  156. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +125 -206
  157. package/src/llama.cpp/ggml/src/ggml.c +4032 -19890
  158. package/src/llama.cpp/include/llama.h +67 -33
  159. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  160. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  161. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  162. package/src/llama.cpp/src/llama-sampling.cpp +745 -105
  163. package/src/llama.cpp/src/llama-sampling.h +21 -2
  164. package/src/llama.cpp/src/llama-vocab.cpp +49 -9
  165. package/src/llama.cpp/src/llama-vocab.h +35 -11
  166. package/src/llama.cpp/src/llama.cpp +2636 -2406
  167. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  168. package/src/llama.cpp/tests/CMakeLists.txt +1 -2
  169. package/src/llama.cpp/tests/test-arg-parser.cpp +14 -14
  170. package/src/llama.cpp/tests/test-backend-ops.cpp +185 -60
  171. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  172. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  173. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  174. package/src/llama.cpp/tests/test-log.cpp +2 -2
  175. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  176. package/src/llama.cpp/tests/test-quantize-fns.cpp +22 -19
  177. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  178. package/src/llama.cpp/tests/test-rope.cpp +1 -0
  179. package/src/llama.cpp/tests/test-sampling.cpp +162 -137
  180. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  181. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  182. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  183. package/src/llama.cpp/common/train.cpp +0 -1515
  184. package/src/llama.cpp/common/train.h +0 -233
  185. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  186. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  187. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  188. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  189. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
  190. /package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +0 -0
@@ -2,6 +2,7 @@
2
2
  #define LLAMA_H
3
3
 
4
4
  #include "ggml.h"
5
+ #include "ggml-cpu.h"
5
6
  #include "ggml-backend.h"
6
7
 
7
8
  #include <stddef.h>
@@ -205,7 +206,7 @@ extern "C" {
205
206
  enum llama_split_mode {
206
207
  LLAMA_SPLIT_MODE_NONE = 0, // single GPU
207
208
  LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
208
- LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
209
+ LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
209
210
  };
210
211
 
211
212
  // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@@ -217,6 +218,7 @@ extern "C" {
217
218
 
218
219
  typedef struct llama_token_data_array {
219
220
  // TODO: consider SoA
221
+ // NOTE: this pointer can be modified by the samplers
220
222
  llama_token_data * data;
221
223
  size_t size;
222
224
  int64_t selected; // this is the index in the data array (i.e. not the token id)
@@ -232,8 +234,11 @@ extern "C" {
232
234
  // - token : the token ids of the input (used when embd is NULL)
233
235
  // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
234
236
  // - pos : the positions of the respective token in the sequence
237
+ // (if set to NULL, the token position will be tracked automatically by llama_decode)
235
238
  // - seq_id : the sequence to which the respective token belongs
239
+ // (if set to NULL, the sequence ID will be assumed to be 0)
236
240
  // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
241
+ // (if set to NULL, only the logits for last token will be returned)
237
242
  //
238
243
  typedef struct llama_batch {
239
244
  int32_t n_tokens;
@@ -244,15 +249,6 @@ extern "C" {
244
249
  int32_t * n_seq_id;
245
250
  llama_seq_id ** seq_id;
246
251
  int8_t * logits; // TODO: rename this to "output"
247
-
248
- // NOTE: helpers for smooth API transition - can be deprecated in the future
249
- // for future-proof code, use the above fields instead and ignore everything below
250
- //
251
- // pos[i] = all_pos_0 + i*all_pos_1
252
- //
253
- llama_pos all_pos_0; // used if pos == NULL
254
- llama_pos all_pos_1; // used if pos == NULL
255
- llama_seq_id all_seq_id; // used if seq_id == NULL
256
252
  } llama_batch;
257
253
 
258
254
  enum llama_model_kv_override_type {
@@ -279,10 +275,7 @@ extern "C" {
279
275
  int32_t n_gpu_layers; // number of layers to store in VRAM
280
276
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs
281
277
 
282
- // main_gpu interpretation depends on split_mode:
283
- // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
284
- // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
285
- // LLAMA_SPLIT_MODE_LAYER: ignored
278
+ // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
286
279
  int32_t main_gpu;
287
280
 
288
281
  // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -433,6 +426,7 @@ extern "C" {
433
426
  LLAMA_API bool llama_supports_mmap (void);
434
427
  LLAMA_API bool llama_supports_mlock (void);
435
428
  LLAMA_API bool llama_supports_gpu_offload(void);
429
+ LLAMA_API bool llama_supports_rpc (void);
436
430
 
437
431
  LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
438
432
  LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@@ -775,15 +769,15 @@ extern "C" {
775
769
  // Decoding
776
770
  //
777
771
 
778
- // Return batch for single sequence of tokens starting at pos_0
772
+ // Return batch for single sequence of tokens
773
+ // The sequence ID will be fixed to 0
774
+ // The position of the tokens will be tracked automatically by llama_decode
779
775
  //
780
776
  // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
781
777
  //
782
778
  LLAMA_API struct llama_batch llama_batch_get_one(
783
779
  llama_token * tokens,
784
- int32_t n_tokens,
785
- llama_pos pos_0,
786
- llama_seq_id seq_id);
780
+ int32_t n_tokens);
787
781
 
788
782
  // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
789
783
  // Each token can be assigned up to n_seq_max sequence ids
@@ -803,7 +797,7 @@ extern "C" {
803
797
  // Processes a batch of tokens with the encoder part of the encoder-decoder model.
804
798
  // Stores the encoder output internally for later use by the decoder cross-attention layers.
805
799
  // 0 - success
806
- // < 0 - error
800
+ // < 0 - error. the KV cache state is restored to the state before this call
807
801
  LLAMA_API int32_t llama_encode(
808
802
  struct llama_context * ctx,
809
803
  struct llama_batch batch);
@@ -811,7 +805,7 @@ extern "C" {
811
805
  // Positive return values do not mean a fatal error, but rather a warning.
812
806
  // 0 - success
813
807
  // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
814
- // < 0 - error
808
+ // < 0 - error. the KV cache state is restored to the state before this call
815
809
  LLAMA_API int32_t llama_decode(
816
810
  struct llama_context * ctx,
817
811
  struct llama_batch batch);
@@ -896,6 +890,7 @@ extern "C" {
896
890
  // Special tokens
897
891
  LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
898
892
  LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
893
+ LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
899
894
  LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
900
895
  LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
901
896
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@@ -904,11 +899,17 @@ extern "C" {
904
899
  LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
905
900
  LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
906
901
 
907
- // Codellama infill tokens
908
- LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
909
- LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
910
- LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
911
- LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
902
+ // infill tokens
903
+ DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
904
+ DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
905
+ DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
906
+
907
+ LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
908
+ LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
909
+ LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
910
+ LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
911
+ LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
912
+ LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
912
913
 
913
914
  //
914
915
  // Tokenization
@@ -1067,12 +1068,13 @@ extern "C" {
1067
1068
 
1068
1069
  // available samplers:
1069
1070
 
1070
- LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
1071
- LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
1071
+ LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
1072
+ LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
1072
1073
 
1073
1074
  /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
1074
1075
  /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
1075
- LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
1076
+ DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
1077
+ "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
1076
1078
 
1077
1079
  /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
1078
1080
  LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@@ -1083,16 +1085,18 @@ extern "C" {
1083
1085
  /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
1084
1086
  LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
1085
1087
 
1086
- /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
1087
- LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
1088
-
1089
1088
  /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
1090
1089
  LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
1090
+
1091
+ /// @details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
1091
1092
  LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
1092
1093
 
1093
1094
  /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
1094
1095
  LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
1095
1096
 
1097
+ /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
1098
+ LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
1099
+
1096
1100
  /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
1097
1101
  /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
1098
1102
  /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1132,11 +1136,43 @@ extern "C" {
1132
1136
  bool penalize_nl, // consider newlines as a repeatable token
1133
1137
  bool ignore_eos); // ignore the end-of-sequence token
1134
1138
 
1139
+ /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1140
+ LLAMA_API struct llama_sampler * llama_sampler_init_dry(
1141
+ const struct llama_model * model,
1142
+ float dry_multiplier,
1143
+ float dry_base,
1144
+ int32_t dry_allowed_length,
1145
+ int32_t dry_penalty_last_n,
1146
+ const char ** seq_breakers,
1147
+ size_t num_breakers);
1148
+
1135
1149
  LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
1136
1150
  int32_t n_vocab,
1137
1151
  int32_t n_logit_bias,
1138
1152
  const llama_logit_bias * logit_bias);
1139
1153
 
1154
+ // this sampler is meant to be used for fill-in-the-middle infilling
1155
+ // it's supposed to be used after top_k + top_p sampling
1156
+ //
1157
+ // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
1158
+ // 2. combine probs of tokens that have the same prefix
1159
+ //
1160
+ // example:
1161
+ //
1162
+ // - before:
1163
+ // "hel": 0.5
1164
+ // "hell": 0.2
1165
+ // "hello": 0.1
1166
+ // "dummy": 0.1
1167
+ //
1168
+ // - after:
1169
+ // "hel": 0.8
1170
+ // "dummy": 0.1
1171
+ //
1172
+ // 3. discard non-EOG tokens with low prob
1173
+ // 4. if no tokens are left -> pick EOT
1174
+ //
1175
+ LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
1140
1176
 
1141
1177
  // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
1142
1178
  LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
@@ -1208,8 +1244,6 @@ extern "C" {
1208
1244
  LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
1209
1245
  LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
1210
1246
 
1211
- LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
1212
-
1213
1247
  #ifdef __cplusplus
1214
1248
  }
1215
1249
  #endif
@@ -11,6 +11,7 @@
11
11
  #include <type_traits>
12
12
 
13
13
  #include <ggml.h>
14
+ #include <ggml-cpu.h>
14
15
 
15
16
  constexpr int kVecSize = 1 << 16;
16
17
 
@@ -136,7 +137,7 @@ int main(int argc, char** argv) {
136
137
 
137
138
  auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
138
139
 
139
- auto funcs = ggml_internal_get_type_traits(ggml_type);
140
+ const auto * funcs = ggml_get_type_traits_cpu(ggml_type);
140
141
 
141
142
  Stat simple, ggml;
142
143
 
@@ -156,8 +157,8 @@ int main(int argc, char** argv) {
156
157
 
157
158
  t1 = std::chrono::high_resolution_clock::now();
158
159
  float fs;
159
- if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
160
- else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
160
+ if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
161
+ else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
161
162
  t2 = std::chrono::high_resolution_clock::now();
162
163
  t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
163
164
  if (iloop > 3) ggml.addResult(fs, t);
@@ -9,6 +9,7 @@
9
9
  #include <array>
10
10
 
11
11
  #include <ggml.h>
12
+ #include <ggml-cpu.h>
12
13
 
13
14
  #if defined(_MSC_VER)
14
15
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -236,7 +237,7 @@ int main(int argc, char** argv) {
236
237
  int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
237
238
  int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
238
239
 
239
- auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
240
+ const auto * funcs_cpu = ggml_get_type_traits_cpu(useQ4_1 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q4_0);
240
241
 
241
242
  std::vector<block_q4_0> q40;
242
243
  std::vector<block_q4_1> q41;
@@ -261,9 +262,9 @@ int main(int argc, char** argv) {
261
262
  // Note, we do not include this in the timing as in practical application
262
263
  // we already have the quantized model weights.
263
264
  if (useQ4_1) {
264
- funcs.from_float(x1.data(), q41.data(), kVecSize);
265
+ funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
265
266
  } else {
266
- funcs.from_float(x1.data(), q40.data(), kVecSize);
267
+ funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
267
268
  }
268
269
 
269
270
  // Now measure time the dot product needs using the "scalar" version above
@@ -282,10 +283,10 @@ int main(int argc, char** argv) {
282
283
  dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
283
284
  }
284
285
  else {
285
- auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
286
- vdot.from_float(y1.data(), q8.data(), kVecSize);
287
- if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
288
- else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
286
+ const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
287
+ vdot->from_float(y1.data(), q8.data(), kVecSize);
288
+ if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
289
+ else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
289
290
  }
290
291
  sumq += result;
291
292
  t2 = std::chrono::high_resolution_clock::now();
@@ -29,5 +29,6 @@ target_link_libraries(llama PUBLIC ggml)
29
29
 
30
30
  if (BUILD_SHARED_LIBS)
31
31
  set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
32
- target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
32
+ target_compile_definitions(llama PRIVATE LLAMA_BUILD)
33
+ target_compile_definitions(llama PUBLIC LLAMA_SHARED)
33
34
  endif()