@fugood/llama.node 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/include/llama-cpp.h
@@ -0,0 +1,25 @@
+ #pragma once
+
+ #ifndef __cplusplus
+ #error "This header is for C++ only"
+ #endif
+
+ #include <memory>
+
+ #include "llama.h"
+
+ struct llama_model_deleter {
+     void operator()(llama_model * model) { llama_free_model(model); }
+ };
+
+ struct llama_context_deleter {
+     void operator()(llama_context * context) { llama_free(context); }
+ };
+
+ struct llama_sampler_deleter {
+     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+ };
+
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
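The new header only defines the deleters and smart-pointer typedefs. A minimal usage sketch, assuming the loader entry points available in this llama.cpp revision (llama_load_model_from_file, llama_new_context_with_model); the model path is a placeholder:

// Minimal sketch (not part of the diff): RAII management of llama.cpp objects.
#include "llama-cpp.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model_ptr model(llama_load_model_from_file("model.gguf", mparams));
    if (!model) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context_ptr ctx(llama_new_context_with_model(model.get(), cparams));

    // llama_free(ctx) and llama_free_model(model) run automatically via the
    // custom deleters when the smart pointers go out of scope.
    return ctx ? 0 : 1;
}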
@@ -104,12 +104,15 @@ extern "C" {
104
104
  LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
105
105
  LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
106
106
  LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
107
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
107
108
  };
108
109
 
109
110
  enum llama_rope_type {
110
- LLAMA_ROPE_TYPE_NONE = -1,
111
- LLAMA_ROPE_TYPE_NORM = 0,
112
- LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
111
+ LLAMA_ROPE_TYPE_NONE = -1,
112
+ LLAMA_ROPE_TYPE_NORM = 0,
113
+ LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
114
+ LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE,
115
+ LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION,
113
116
  };
114
117
 
115
118
  enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -171,9 +174,9 @@ extern "C" {
171
174
  LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
172
175
  LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
173
176
  LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
174
- LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
175
- LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
176
- LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
177
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
178
+ //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
179
+ //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
177
180
  LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
178
181
  LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
179
182
 
@@ -185,7 +188,8 @@ extern "C" {
185
188
  LLAMA_ROPE_SCALING_TYPE_NONE = 0,
186
189
  LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
187
190
  LLAMA_ROPE_SCALING_TYPE_YARN = 2,
188
- LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
191
+ LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
192
+ LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
189
193
  };
190
194
 
191
195
  enum llama_pooling_type {
@@ -272,6 +276,9 @@ extern "C" {
272
276
  };
273
277
 
274
278
  struct llama_model_params {
279
+ // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
280
+ ggml_backend_dev_t * devices;
281
+
275
282
  int32_t n_gpu_layers; // number of layers to store in VRAM
276
283
  enum llama_split_mode split_mode; // how to split the model across multiple GPUs
277
284
 
@@ -451,6 +458,7 @@ extern "C" {
451
458
  // Functions to access the model's GGUF metadata scalar values
452
459
  // - The functions return the length of the string on success, or -1 on failure
453
460
  // - The output string is always null-terminated and cleared on failure
461
+ // - When retrieving a string, an extra byte must be allocated to account for the null terminator
454
462
  // - GGUF array values are not supported by these functions
455
463
 
456
464
  // Get metadata value as a string by key name
@@ -474,9 +482,6 @@ extern "C" {
474
482
  // Returns the total number of parameters in the model
475
483
  LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
476
484
 
477
- // Get a llama model tensor
478
- LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
479
-
480
485
  // Returns true if the model contains an encoder that requires llama_encode() call
481
486
  LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
482
487
 
@@ -667,6 +672,9 @@ extern "C" {
667
672
  // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
668
673
  LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
669
674
 
675
+ // Check if the context supports KV cache shifting
676
+ LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
677
+
670
678
  //
671
679
  // State / sessions
672
680
  //
@@ -984,6 +992,9 @@ extern "C" {
984
992
  char * buf,
985
993
  int32_t length);
986
994
 
995
+ // Get list of built-in chat templates
996
+ LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
997
+
987
998
  //
988
999
  // Sampling API
989
1000
  //
@@ -1125,16 +1136,12 @@ extern "C" {
1125
1136
  const char * grammar_str,
1126
1137
  const char * grammar_root);
1127
1138
 
1139
+ /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
1128
1140
  LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
1129
- int32_t n_vocab, // llama_n_vocab()
1130
- llama_token special_eos_id, // llama_token_eos()
1131
- llama_token linefeed_id, // llama_token_nl()
1132
- int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
1133
- float penalty_repeat, // 1.0 = disabled
1134
- float penalty_freq, // 0.0 = disabled
1135
- float penalty_present, // 0.0 = disabled
1136
- bool penalize_nl, // consider newlines as a repeatable token
1137
- bool ignore_eos); // ignore the end-of-sequence token
1141
+ int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
1142
+ float penalty_repeat, // 1.0 = disabled
1143
+ float penalty_freq, // 0.0 = disabled
1144
+ float penalty_present); // 0.0 = disabled
1138
1145
 
1139
1146
  /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
1140
1147
  LLAMA_API struct llama_sampler * llama_sampler_init_dry(
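Callers of the penalties sampler drop the vocabulary/EOS/newline arguments and the penalize_nl / ignore_eos flags. A hedged sketch of the new call in a sampler chain, following the NOTE above by narrowing candidates with top-k first (the parameter values here are illustrative, not taken from the diff):

#include "llama.h"

static llama_sampler * make_sampler_chain() {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    // narrow the candidate set before the repeat-penalty search, per the NOTE above
    llama_sampler_chain_add(chain, llama_sampler_init_top_k(40));

    llama_sampler_chain_add(chain, llama_sampler_init_penalties(
            /* penalty_last_n  */ 64,
            /* penalty_repeat  */ 1.1f,
            /* penalty_freq    */ 0.0f,
            /* penalty_present */ 0.0f));

    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    return chain; // release later with llama_sampler_free(chain)
}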
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp
@@ -0,0 +1,112 @@
+ ied 4 ½ months
+ __ggml_vocab_test__
+ Führer
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+
+
+
+ __ggml_vocab_test__
+
+
+
+
+ __ggml_vocab_test__
+
+
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello world
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World
+ __ggml_vocab_test__
+ Hello World!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ Hello, world!
+ __ggml_vocab_test__
+ this is 🦙.cpp
+ __ggml_vocab_test__
+ w048 7tuijk dsdfhu
+ __ggml_vocab_test__
+ нещо на Български
+ __ggml_vocab_test__
+ កាន់តែពិសេសអាចខលចេញ
+ __ggml_vocab_test__
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ __ggml_vocab_test__
+ Hello
+ Hello
+ __ggml_vocab_test__
+ (
+ __ggml_vocab_test__
+
+ =
+ __ggml_vocab_test__
+ ' era
+ __ggml_vocab_test__
+ Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
+ __ggml_vocab_test__
+ !!!!!!
+ __ggml_vocab_test__
+ 3
+ __ggml_vocab_test__
+ 33
+ __ggml_vocab_test__
+ 333
+ __ggml_vocab_test__
+ 3333
+ __ggml_vocab_test__
+ 33333
+ __ggml_vocab_test__
+ 333333
+ __ggml_vocab_test__
+ 3333333
+ __ggml_vocab_test__
+ 33333333
+ __ggml_vocab_test__
+ 333333333
+ __ggml_vocab_test__
+ Cửa Việt
+ __ggml_vocab_test__
+ discards
+ __ggml_vocab_test__
+
+
+
+
+
+
+
+
+
+
+
+ 🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+ __ggml_vocab_test__
package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out
@@ -0,0 +1,46 @@
+ 2550 204 18430 377
+ 597 2768 298 8564
+
+ 1437
+ 1437 1437
+ 1437 1437 1437
+ 50117
+ 50118
+ 50140
+ 50140 50118
+ 50117 50118
+ 31414 232
+ 20920 232
+ 31414 623
+ 20920 623
+ 20920 623 328
+ 31414 6 232 328
+ 20920 6 232 328
+ 42 16 8103 18164 27 4 49317
+ 605 40976 262 10109 18474 385 29 36807 6455
+ 36765 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328
+ 1376 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 1376 17772 10172 1376 17772 3726 1376 17772 5782 1376 4333 10172 1376 17772 23171
+ 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 36 8338 21554 14 34 63 308 19233 43
+ 31414
+ 20920
+ 1437 20920
+ 1437 1437 20920
+ 1437 1437 1437 20920
+ 1437 1437 1437 20920 50118 1437 1437 1437 20920
+ 36
+ 50118 5457
+ 108 3567
+ 31414 6 1423 108 1250 328 1336 32 47 17841 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772
+ 32376 12846
+ 246
+ 3103
+ 25631
+ 46152
+ 3103 25631
+ 46152 3103
+ 46152 25631
+ 46152 46152
+ 46152 3103 25631
+ 347 1376 2023 12410 102 16376 1376 2023 6382 90
+ 9553 5954
+ 50118 1437 50140 1437 50140 50118 1437 50117 1437 50117 50117 1437 50117 50118 1437 1437 50118 1437 1437 1437 50118 1437 1437 1437 1437 50118 1437 1437 1437 1437 1437 50118 6569 15113 7471 36 21113 43 17841 19002 17 8384 6569 14285 4958 12605 36 34654 2841 4203 354 10146 26511 1070 43 36174 5782 8103 18164 27 6569 18164 27 155 2357 30242 155 25631 30242 3103 30242 25631 30242 46152 30242 3103 25631 155 4 246 155 7586 246 155 734 246 25974 17772 7471 1376 17772 19002 1376 17772 9085 1376 4333 13859 1376 17772 9357 1376 4333 9264 1376 17772 25448 1376 17772 18400 1376 17772 4333 1376 4333 10172 1376 17772 4333 1376 17772 7258 1376 17772 19002 1376 17772 5782 18636 10172 17487 47876 3602 48617 15264 46537 11423 27326 48494 8210 49233 1558 1570 27761 49429 43251 10809 17772 36738 48332 47463 18697 10809 25482 22063 23171 34251 18697 10809 26161 18697 3602 22063 27969 40966 25417 15264 26161 24269 36709 41171 35328 128 49690 108 49972 49519 12905 48149 48149 43796 32376 12846 27282 28749 38 348 57 128 41042 37 18 89 6 128 4629 47 686 116 128 448 45 686 38 581 146 24 6 128 495 47 101 103 6845 116 166 108 30660 10 108 462 574
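These two fixture files form input/expected-output pairs for the tokenizer tests: each chunk of the .inp file (delimited by __ggml_vocab_test__) corresponds to one line of token ids in the .out file. A hedged sketch of the comparison step, with an illustrative helper name (tokenize_chunk) and no claim about the exact test harness used upstream:

// Sketch only: tokenize one .inp chunk with the loaded RoBERTa-BPE vocab and
// compare the result against the matching line of ids from the .out file.
#include <string>
#include <vector>
#include "llama.h"

static std::vector<llama_token> tokenize_chunk(const llama_model * model, const std::string & text) {
    // worst case: roughly one token per byte, plus room for special tokens
    std::vector<llama_token> tokens(text.size() + 2);
    const int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                                 tokens.data(), (int) tokens.size(),
                                 /*add_special*/ false, /*parse_special*/ false);
    tokens.resize(n > 0 ? n : 0);
    return tokens; // caller compares this against the expected id list
}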
package/src/llama.cpp/pocs/CMakeLists.txt
@@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

  if (EMSCRIPTEN)
  else()
-     add_subdirectory(vdot)
+     if (NOT GGML_BACKEND_DL)
+         add_subdirectory(vdot)
+     endif()
  endif()
package/src/llama.cpp/pocs/vdot/CMakeLists.txt
@@ -1,9 +1,9 @@
  set(TARGET llama-vdot)
  add_executable(${TARGET} vdot.cpp)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)

  set(TARGET llama-q8dot)
  add_executable(${TARGET} q8dot.cpp)
  target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
- target_compile_features(${TARGET} PRIVATE cxx_std_11)
+ target_compile_features(${TARGET} PRIVATE cxx_std_17)
package/src/llama.cpp/src/CMakeLists.txt
@@ -1,9 +1,4 @@
- # TODO: should not use this
- if (WIN32)
-     if (BUILD_SHARED_LIBS)
-         set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-     endif()
- endif()
+ llama_add_compile_flags()

  #
  # libraries
@@ -23,7 +18,7 @@ add_library(llama
  )

  target_include_directories(llama PUBLIC . ../include)
- target_compile_features (llama PUBLIC cxx_std_11) # don't bump
+ target_compile_features (llama PUBLIC cxx_std_17) # don't bump

  target_link_libraries(llama PUBLIC ggml)

package/src/llama.cpp/src/llama-grammar.cpp
@@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar)
      return grammar->stacks;
  }

- void llama_grammar_accept(
-         const llama_grammar_rules  & rules,
-         const llama_grammar_stacks & stacks,
-         const uint32_t chr,
-         llama_grammar_stacks       & stacks_new) {
-     stacks_new.clear();
-     stacks_new.reserve(stacks.size());
+ void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) {
+     llama_grammar_stacks stacks_new;
+     stacks_new.reserve(grammar->stacks.size());

-     for (const auto & stack : stacks) {
+     for (const auto & stack : grammar->stacks) {
          if (stack.empty()) {
              continue;
          }
@@ -844,9 +840,11 @@ void llama_grammar_accept(
              if (!llama_grammar_is_end_of_sequence(pos)) {
                  new_stack.push_back(pos);
              }
-             llama_grammar_advance_stack(rules, new_stack, stacks_new);
+             llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new);
          }
      }
+
+     grammar->stacks = std::move(stacks_new);
  }

  llama_grammar_candidates llama_grammar_reject_candidates_for_stack(
@@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) {
  }

  struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) {
-     llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, };
+     llama_grammar * result = new llama_grammar {
+         grammar.vocab,
+         grammar.rules,
+         grammar.stacks,
+         grammar.partial_utf8,
+     };

      // redirect elements in stacks to point to new rules
      for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra
          for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) {
              for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) {
                  if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) {
-                     result->stacks[is][ie]  = &result->rules[ir0][ir1];
+                     result->stacks[is][ie] = &result->rules[ir0][ir1];
                  }
              }
          }
@@ -1126,11 +1129,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
      const auto decoded = decode_utf8(piece, grammar.partial_utf8);
      const auto & code_points = decoded.first;

-     llama_grammar_stacks stacks_new;
-
      for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
-         llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new);
-         grammar.stacks = std::move(stacks_new);
+         llama_grammar_accept(&grammar, *it);
      }

      grammar.partial_utf8 = decoded.second;
package/src/llama.cpp/src/llama-grammar.h
@@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector<llama_grammar_rule>;
  using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
  using llama_grammar_candidates = std::vector<llama_grammar_candidate>;

+ // TODO: remove, needed for tests atm
  const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
        llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);

@@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar
  // be positioned at a character range (see `llama_grammar_advance_stack`), and
  // produces the N possible stacks if the given char is accepted at those
  // positions
- void llama_grammar_accept(
-         const llama_grammar_rules  & rules,
-         const llama_grammar_stacks & stacks,
-         uint32_t chr,
-         llama_grammar_stacks & stacks_new);
+ void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr);

  std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
          const llama_grammar_rules & rules,
package/src/llama.cpp/src/llama-sampling.cpp
@@ -1396,19 +1396,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
  // penalties

  struct llama_sampler_penalties {
-     const int32_t n_vocab;
-     const llama_token special_eos_id;
-     const llama_token linefeed_id;
-
      const int32_t penalty_last_n;
      const float   penalty_repeat;
      const float   penalty_freq;
      const float   penalty_present;

-     const bool penalize_nl;
-     const bool ignore_eos;
-
      ring_buffer<llama_token> prev;
+
+     // a frequency map to count token occurrences
+     std::unordered_map<llama_token, int> token_count;
  };

  static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1421,76 +1417,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
          return;
      }

-     ctx->prev.push_back(token);
- }
-
- static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+     ctx->token_count[token]++;

-     if (ctx->ignore_eos) {
-         assert(ctx->special_eos_id >= 0);
+     // if the ring buffer is full, remove the oldest token
+     if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+         const auto old = ctx->prev.front();

-         // optimistically check if the candidates are not yet sorted/shuffled/truncated
-         if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
-             cur_p->data[ctx->special_eos_id].logit = -INFINITY;
-         } else {
-             // else, search for the special EOS token
-             for (size_t i = 0; i < cur_p->size; ++i) {
-                 if (cur_p->data[i].id == ctx->special_eos_id) {
-                     cur_p->data[i].logit = -INFINITY;
-                     break;
-                 }
-             }
+         ctx->token_count[old]--;
+         if (ctx->token_count[old] == 0) {
+             ctx->token_count.erase(old);
          }
      }

-     if ((ctx->penalty_last_n == 0) ||
-         (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-         return;
-     }
-
-     bool nl_found = false;
-     size_t nl_idx = 0;
-     float nl_logit = -INFINITY;
-     if (!ctx->penalize_nl) {
-         assert(ctx->linefeed_id >= 0);
+     ctx->prev.push_back(token);

-         // optimistically check if the candidates are not yet sorted/shuffled/truncated
-         if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
-             nl_found = true;
-             nl_idx = ctx->linefeed_id;
-             nl_logit = cur_p->data[ctx->linefeed_id].logit;
-         } else {
-             // else, search for the linefeed token
-             for (size_t i = 0; i < cur_p->size; ++i) {
-                 if (cur_p->data[i].id == ctx->linefeed_id) {
-                     nl_found = true;
-                     nl_idx = i;
-                     nl_logit = cur_p->data[i].logit;
-                     break;
-                 }
-             }
-         }
+ #if 0
+     // sanity check
+     std::unordered_map<llama_token, int> tmp;
+     for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+         tmp[ctx->prev.rat(i)]++;
      }

-     // Create a frequency map to count occurrences of each token in last_tokens
-     // TODO: optimize this by maintaining the token count in the sampler context
-     using llama_token_cnt = std::unordered_map<llama_token, int>;
-     llama_token_cnt token_count;
+     assert(ctx->token_count == tmp);
+ #endif
+ }
+
+ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+     auto * ctx = (llama_sampler_penalties *) smpl->ctx;

-     for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
-         token_count[ctx->prev.rat(i)]++;
+     if ((ctx->penalty_last_n == 0) ||
+         (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+         return;
      }

      // Apply frequency and presence penalties to the cur_p
      for (size_t i = 0; i < cur_p->size; ++i) {
-         const auto token_iter = token_count.find(cur_p->data[i].id);
-         if (token_iter == token_count.end()) {
+         const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+         if (token_iter == ctx->token_count.end()) {
              continue;
          }

          const int count = token_iter->second;

+         assert(count > 0 && count <= ctx->penalty_last_n);
+
          // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
          // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
          if (cur_p->data[i].logit <= 0) {
@@ -1503,30 +1473,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
      }

      cur_p->sorted = false;
-
-     if (!ctx->penalize_nl && nl_found) {
-         // restore the logit of the newline token if it was penalized
-         cur_p->data[nl_idx].logit = nl_logit;
-     }
  }

  static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
      auto * ctx = (llama_sampler_penalties *) smpl->ctx;
      ctx->prev.clear();
+     ctx->token_count.clear();
  }

  static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
      const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
      auto * result = llama_sampler_init_penalties(
-             ctx->n_vocab,
-             ctx->special_eos_id,
-             ctx->linefeed_id,
              ctx->penalty_last_n,
              ctx->penalty_repeat,
              ctx->penalty_freq,
-             ctx->penalty_present,
-             ctx->penalize_nl,
-             ctx->ignore_eos);
+             ctx->penalty_present);

      // copy the state
      {
@@ -1552,38 +1513,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
  };

  struct llama_sampler * llama_sampler_init_penalties(
-         int32_t n_vocab,
-         llama_token special_eos_id,
-         llama_token linefeed_id,
          int32_t penalty_last_n,
          float penalty_repeat,
          float penalty_freq,
-         float penalty_present,
-         bool penalize_nl,
-         bool ignore_eos) {
-     if (linefeed_id == LLAMA_TOKEN_NULL) {
-         penalize_nl = true;
-     }
-
-     if (special_eos_id == LLAMA_TOKEN_NULL) {
-         ignore_eos = false;
-     }
-
+         float penalty_present) {
      penalty_last_n = std::max(penalty_last_n, 0);

      return new llama_sampler {
          /* .iface = */ &llama_sampler_penalties_i,
          /* .ctx   = */ new llama_sampler_penalties {
-             /* .n_vocab         = */ n_vocab,
-             /* .special_eos_id  = */ special_eos_id,
-             /* .linefeed_id     = */ linefeed_id,
              /* .penalty_last_n  = */ penalty_last_n,
              /* .penalty_repeat  = */ penalty_repeat,
              /* .penalty_freq    = */ penalty_freq,
              /* .penalty_present = */ penalty_present,
-             /* .penalize_nl     = */ penalize_nl,
-             /* .ignore_eos      = */ ignore_eos,
              /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+             /* .token_count     = */ {},
          },
      };
  }
@@ -1611,7 +1555,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
          if (word.find(str) != std::string::npos) {
              token_sequences.emplace(token_id, std::vector<llama_token>());
          } else {
-             size_t word_len = word.size(), str_len = str.size();
+             size_t word_len = word.size();
+             size_t str_len = str.size();
              size_t pos = -1;
              while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                  bool match = true;
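The penalties refactor above replaces the per-apply() rebuild of the frequency map with incremental bookkeeping in accept(). A minimal standalone sketch of that bookkeeping, using std::deque as a stand-in for llama.cpp's ring_buffer (the names here are illustrative, not the library's):

// Sketch: keep a sliding window of the last n accepted tokens and a count map
// that is updated as tokens enter and leave the window.
#include <cstdint>
#include <deque>
#include <unordered_map>

using token_t = int32_t;

struct penalty_state {
    int32_t penalty_last_n;
    std::deque<token_t> prev;                      // last n accepted tokens
    std::unordered_map<token_t, int> token_count;  // occurrences within prev
};

void accept_token(penalty_state & st, token_t token) {
    if (st.penalty_last_n <= 0) {
        return; // penalties disabled, nothing to track
    }

    st.token_count[token]++;

    // if the window is full, drop the oldest token and its count
    if ((int32_t) st.prev.size() >= st.penalty_last_n) {
        const token_t old = st.prev.front();
        st.prev.pop_front();
        if (--st.token_count[old] == 0) {
            st.token_count.erase(old);
        }
    }

    st.prev.push_back(token);
}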
package/src/llama.cpp/src/llama-vocab.cpp
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
          case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
          case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
          case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+         case LLAMA_VOCAB_PRE_TYPE_MINERVA:
              regex_exprs = {
                  "\\p{N}",
                  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
      std::vector<std::string> words(1, "");

      for (const uint32_t cpt : cpts_nfd) {
-         const auto flags = unicode_cpt_flags(cpt);
+         const auto flags = unicode_cpt_flags_from_cpt(cpt);

          if (flags.is_whitespace) {
              if (words.back().size()) { // finish previous word if any
@@ -1866,6 +1867,10 @@ int32_t llama_detokenize_impl(
          int32_t text_len_max,
          bool remove_special,
          bool unparse_special) {
+     if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
+         return 0;
+     }
+
      GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");

      int32_t avail = text_len_max;