@fugood/llama.node 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/common/common.cpp

@@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
  [](const unsigned char c) { return !std::isprint(c); }),
  detokenized.end());

- buf << "\n" << std::to_string(i)
- << ":token '" << detokenized << "'"
- << ":pos " << std::to_string(batch.pos[i])
- << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
- << ":seq_id " << std::to_string(batch.seq_id[i][0])
- << ":logits " << std::to_string(batch.logits[i]);
+ buf << "\n" << std::to_string(i)
+ << ", token '" << detokenized << "'"
+ << ", pos " << std::to_string(batch.pos[i])
+ << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+ << ", seq_id " << std::to_string(batch.seq_id[i][0])
+ << ", logits " << std::to_string(batch.logits[i]);
  }

  buf << " ]";
@@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) {

  std::u32string filename_utf32;
  try {
+ #if defined(__clang__)
+ // disable C++17 deprecation warning for std::codecvt_utf8
+ # pragma clang diagnostic push
+ # pragma clang diagnostic ignored "-Wdeprecated-declarations"
+ #endif
  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
+
+ #if defined(__clang__)
+ # pragma clang diagnostic pop
+ #endif
+
  filename_utf32 = converter.from_bytes(filename);

  // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used,
@@ -829,9 +839,9 @@ struct common_init_result common_init_from_params(common_params & params) {
  llama_model * model = nullptr;

  if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
  } else if (!params.model_url.empty()) {
- model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+ model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
  } else {
  model = llama_load_model_from_file(params.model.c_str(), mparams);
  }
@@ -875,6 +885,12 @@ struct common_init_result common_init_from_params(common_params & params) {
  return iparams;
  }

+ if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+ LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+ llama_free_model(model);
+ return iparams;
+ }
+
  if (!params.control_vectors.empty()) {
  if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
  if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
@@ -919,9 +935,28 @@ struct common_init_result common_init_from_params(common_params & params) {
  common_lora_adapters_apply(lctx, iparams.lora_adapters);
  }

- if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+ if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
  LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
- params.sparams.ignore_eos = false;
+ params.sampling.ignore_eos = false;
+ }
+
+ if (params.sampling.ignore_eos) {
+ for (llama_token i = 0; i < llama_n_vocab(model); i++) {
+ if (llama_token_is_eog(model, i)) {
+ LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
+ params.sampling.logit_bias.push_back({i, -INFINITY});
+ }
+ }
+ }
+
+ if (params.sampling.penalty_last_n == -1) {
+ LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+ params.sampling.penalty_last_n = llama_n_ctx(lctx);
+ }
+
+ if (params.sampling.dry_penalty_last_n == -1) {
+ LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+ params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
  }

  if (params.warmup) {
@@ -973,9 +1008,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
  }
  }

- struct llama_model_params common_model_params_to_llama(const common_params & params) {
+ struct llama_model_params common_model_params_to_llama(common_params & params) {
  auto mparams = llama_model_default_params();

+ if (!params.devices.empty()) {
+ mparams.devices = params.devices.data();
+ }
  if (params.n_gpu_layers != -1) {
  mparams.n_gpu_layers = params.n_gpu_layers;
  }
@@ -996,38 +1034,6 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
  return mparams;
  }

- static ggml_type kv_cache_type_from_str(const std::string & s) {
- if (s == "f32") {
- return GGML_TYPE_F32;
- }
- if (s == "f16") {
- return GGML_TYPE_F16;
- }
- if (s == "bf16") {
- return GGML_TYPE_BF16;
- }
- if (s == "q8_0") {
- return GGML_TYPE_Q8_0;
- }
- if (s == "q4_0") {
- return GGML_TYPE_Q4_0;
- }
- if (s == "q4_1") {
- return GGML_TYPE_Q4_1;
- }
- if (s == "iq4_nl") {
- return GGML_TYPE_IQ4_NL;
- }
- if (s == "q5_0") {
- return GGML_TYPE_Q5_0;
- }
- if (s == "q5_1") {
- return GGML_TYPE_Q5_1;
- }
-
- throw std::runtime_error("Unsupported cache type: " + s);
- }
-
  struct llama_context_params common_context_params_to_llama(const common_params & params) {
  auto cparams = llama_context_default_params();

@@ -1062,8 +1068,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
  }

- cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
- cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+ cparams.type_k = params.cache_type_k;
+ cparams.type_v = params.cache_type_v;

  return cparams;
  }
@@ -1089,13 +1095,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
  #define CURL_MAX_RETRY 3
  #define CURL_RETRY_DELAY_SECONDS 2

-
- static bool starts_with(const std::string & str, const std::string & prefix) {
- // While we wait for C++20's std::string::starts_with...
- return str.rfind(prefix, 0) == 0;
- }
-
- static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
+ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
  int remaining_attempts = max_attempts;

  while (remaining_attempts > 0) {
@@ -1119,7 +1119,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
  }

  static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-
  // Initialize libcurl
  std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
  if (!curl) {
@@ -1192,11 +1191,13 @@ static bool common_download_file(const std::string & url, const std::string & pa
  std::string etag;
  std::string last_modified;
  };
+
  common_load_model_from_url_headers headers;
+
  {
  typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
  auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
- common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
+ common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

  static std::regex header_regex("([^:]+): (.*)\r\n");
  static std::regex etag_regex("ETag", std::regex_constants::icase);
@@ -1333,17 +1334,17 @@ static bool common_download_file(const std::string & url, const std::string & pa
  }

  struct llama_model * common_load_model_from_url(
- const char * model_url,
- const char * path_model,
- const char * hf_token,
+ const std::string & model_url,
+ const std::string & local_path,
+ const std::string & hf_token,
  const struct llama_model_params & params) {
  // Basic validation of the model_url
- if (!model_url || strlen(model_url) == 0) {
+ if (model_url.empty()) {
  LOG_ERR("%s: invalid model_url\n", __func__);
  return NULL;
  }

- if (!common_download_file(model_url, path_model, hf_token)) {
+ if (!common_download_file(model_url, local_path, hf_token)) {
  return NULL;
  }

@@ -1354,9 +1355,9 @@ struct llama_model * common_load_model_from_url(
  /*.no_alloc = */ true,
  /*.ctx = */ NULL,
  };
- auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params);
+ auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
  if (!ctx_gguf) {
- LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model);
+ LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
  return NULL;
  }

@@ -1375,13 +1376,13 @@ struct llama_model * common_load_model_from_url(
  // Verify the first split file format
  // and extract split URL and PATH prefixes
  {
- if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) {
- LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split);
+ if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
  return NULL;
  }

- if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) {
- LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split);
+ if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
+ LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
  return NULL;
  }
  }
@@ -1408,14 +1409,14 @@ struct llama_model * common_load_model_from_url(
  }
  }

- return llama_load_model_from_file(path_model, params);
+ return llama_load_model_from_file(local_path.c_str(), params);
  }

  struct llama_model * common_load_model_from_hf(
- const char * repo,
- const char * model,
- const char * path_model,
- const char * hf_token,
+ const std::string & repo,
+ const std::string & remote_path,
+ const std::string & local_path,
+ const std::string & hf_token,
  const struct llama_model_params & params) {
  // construct hugging face model url:
  //
@@ -1429,27 +1430,27 @@ struct llama_model * common_load_model_from_hf(
  std::string model_url = "https://huggingface.co/";
  model_url += repo;
  model_url += "/resolve/main/";
- model_url += model;
+ model_url += remote_path;

- return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+ return common_load_model_from_url(model_url, local_path, hf_token, params);
  }

  #else

  struct llama_model * common_load_model_from_url(
- const char * /*model_url*/,
- const char * /*path_model*/,
- const char * /*hf_token*/,
+ const std::string & /*model_url*/,
+ const std::string & /*local_path*/,
+ const std::string & /*hf_token*/,
  const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
  return nullptr;
  }

  struct llama_model * common_load_model_from_hf(
- const char * /*repo*/,
- const char * /*model*/,
- const char * /*path_model*/,
- const char * /*hf_token*/,
+ const std::string & /*repo*/,
+ const std::string & /*remote_path*/,
+ const std::string & /*local_path*/,
+ const std::string & /*hf_token*/,
  const struct llama_model_params & /*params*/) {
  LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
  return nullptr;
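The download helpers now take std::string arguments instead of const char * (only when llama.cpp is built with libcurl). A minimal call sketch under that assumption; the repo, file, and cache path are placeholders, not values from this package:

    // sketch: fetching a GGUF from Hugging Face with the std::string-based API
    #include "common.h"

    static llama_model * fetch_model(common_params & params) {
        const llama_model_params mparams = common_model_params_to_llama(params);

        return common_load_model_from_hf(
            "your-org/your-model-repo",   // repo (placeholder)
            "model-q4_0.gguf",            // remote_path inside the repo (placeholder)
            "./models/model-q4_0.gguf",   // local_path to download to (placeholder)
            params.hf_token,              // may be empty
            mparams);
    }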
@@ -1484,6 +1485,66 @@ void common_batch_add(
  batch.n_tokens++;
  }

+ //
+ // Token utils
+ //
+
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
+ size_t i;
+ for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
+
+ return i;
+ }
+
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
+ // check for empty sequences
+ if (a.empty() || b.empty()) {
+ return 0;
+ }
+
+ // get the lengths of the input sequences
+ size_t a_len = a.size();
+ size_t b_len = b.size();
+
+ // initialize the maximum length of the longest common subsequence (LCS)
+ size_t max_length = 0;
+
+ // use two rows instead of a 2D matrix to optimize space
+ std::vector<size_t> prev_row(b_len + 1, 0);
+ std::vector<size_t> curr_row(b_len + 1, 0);
+
+ // iterate through the elements of a
+ for (size_t i = 1; i <= a_len; i++) {
+ // iterate through the elements of b
+ for (size_t j = 1; j <= b_len; j++) {
+ // if elements at the current positions match
+ if (a[i - 1] == b[j - 1]) {
+ // if it's the first element of either sequences, set LCS length to 1
+ if (i == 1 || j == 1) {
+ curr_row[j] = 1;
+ } else {
+ // increment LCS length by 1 compared to the previous element
+ curr_row[j] = prev_row[j - 1] + 1;
+ }
+
+ // update max_length if necessary
+ if (curr_row[j] > max_length) {
+ max_length = curr_row[j];
+ }
+ } else {
+ // reset LCS length if elements don't match
+ curr_row[j] = 0;
+ }
+ }
+
+ // update the previous row for the next iteration
+ prev_row = curr_row;
+ }
+
+ // return the maximum length of the LCS
+ return max_length;
+ }
+
  //
  // Vocab utils
  //
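The new token utilities operate on the llama_tokens alias (std::vector<llama_token>) introduced in common.h. A minimal usage sketch, not part of the diff itself; the function and variable names below are illustrative only:

    // sketch: comparing a cached prompt against an incoming one with the new helpers
    #include "common.h"  // declares llama_tokens, common_lcp, common_lcs
    #include <cstdio>

    static void report_overlap(const llama_tokens & cached, const llama_tokens & incoming) {
        const size_t prefix = common_lcp(cached, incoming); // tokens identical starting at position 0
        const size_t run    = common_lcs(cached, incoming); // longest contiguous run of matching tokens
        printf("reusable prefix: %zu tokens, longest shared run: %zu tokens\n", prefix, run);
    }

Note that although the comments say "subsequence", common_lcs resets its counter on a mismatch, so it measures the longest contiguous common run.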
@@ -1720,7 +1781,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm)
  break;
  case 0: // max absolute
  for (int i = 0; i < n; i++) {
- if (sum < std::abs(inp[i])) sum = std::abs(inp[i]);
+ if (sum < std::abs(inp[i])) {
+ sum = std::abs(inp[i]);
+ }
  }
  sum /= 32760.0; // make an int16 range
  break;
package/src/llama.cpp/common/common.h

@@ -33,11 +33,13 @@ struct common_lora_adapter_container : common_lora_adapter_info {
  struct llama_lora_adapter * adapter;
  };

+ using llama_tokens = std::vector<llama_token>;
+
  // build info
  extern int LLAMA_BUILD_NUMBER;
- extern char const * LLAMA_COMMIT;
- extern char const * LLAMA_COMPILER;
- extern char const * LLAMA_BUILD_TARGET;
+ extern const char * LLAMA_COMMIT;
+ extern const char * LLAMA_COMPILER;
+ extern const char * LLAMA_BUILD_TARGET;

  struct common_control_vector_load_info;

@@ -78,6 +80,7 @@ enum llama_example {
  LLAMA_EXAMPLE_LLAVA,
  LLAMA_EXAMPLE_LOOKUP,
  LLAMA_EXAMPLE_PARALLEL,
+ LLAMA_EXAMPLE_TTS,

  LLAMA_EXAMPLE_COUNT,
  };
@@ -93,6 +96,7 @@ enum common_sampler_type {
  COMMON_SAMPLER_TYPE_TEMPERATURE = 7,
  COMMON_SAMPLER_TYPE_XTC = 8,
  COMMON_SAMPLER_TYPE_INFILL = 9,
+ COMMON_SAMPLER_TYPE_PENALTIES = 10,
  };

  // dimensionality reduction methods, used by cvector-generator
@@ -101,8 +105,8 @@ enum dimre_method {
  DIMRE_METHOD_MEAN,
  };

- // sampler parameters
- struct common_sampler_params {
+ // sampling parameters
+ struct common_params_sampling {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

  int32_t n_prev = 64; // number of previous tokens to remember
@@ -128,14 +132,15 @@ struct common_sampler_params {
  int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
  float mirostat_tau = 5.00f; // target entropy
  float mirostat_eta = 0.10f; // learning rate
- bool penalize_nl = false; // consider newlines as a repeatable token
  bool ignore_eos = false;
  bool no_perf = false; // disable performance metrics
+ bool timing_per_token = false;

  std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY


  std::vector<enum common_sampler_type> samplers = {
+ COMMON_SAMPLER_TYPE_PENALTIES,
  COMMON_SAMPLER_TYPE_DRY,
  COMMON_SAMPLER_TYPE_TOP_K,
  COMMON_SAMPLER_TYPE_TYPICAL_P,
@@ -153,21 +158,39 @@ struct common_sampler_params {
  std::string print() const;
  };

+ struct common_params_speculative {
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+ int32_t n_ctx = 0; // draft context size
+ int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+ int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+ float p_split = 0.1f; // speculative decoding split probability
+ float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+ struct cpu_params cpuparams;
+ struct cpu_params cpuparams_batch;
+
+ std::string model = ""; // draft model for speculative decoding // NOLINT
+ };
+
+ struct common_params_vocoder {
+ std::string hf_repo = ""; // HF repo // NOLINT
+ std::string hf_file = ""; // HF file // NOLINT
+
+ std::string model = ""; // model path // NOLINT
+ std::string model_url = ""; // model url to download // NOLINT
+ };
+
  struct common_params {
  int32_t n_predict = -1; // new tokens to predict
  int32_t n_ctx = 4096; // context size
  int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
  int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
  int32_t n_keep = 0; // number of tokens to keep from initial prompt
- int32_t n_draft = 5; // number of tokens to draft during speculative decoding
  int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
  int32_t n_parallel = 1; // number of parallel sequences to decode
  int32_t n_sequences = 1; // number of sequences to decode
- float p_split = 0.1f; // speculative decoding split probability
- int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
- int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
- int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
- float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
  int32_t grp_attn_n = 1; // group-attention factor
  int32_t grp_attn_w = 512; // group-attention width
  int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
@@ -180,26 +203,33 @@ struct common_params {
  int32_t yarn_orig_ctx = 0; // YaRN original context length
  float defrag_thold = 0.1f; // KV cache defragmentation threshold

+ // offload params
+ std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+
+ int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+ int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+ float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+
+ enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
+
  struct cpu_params cpuparams;
  struct cpu_params cpuparams_batch;
- struct cpu_params draft_cpuparams;
- struct cpu_params draft_cpuparams_batch;

  ggml_backend_sched_eval_callback cb_eval = nullptr;
  void * cb_eval_user_data = nullptr;

  ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

- enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
  enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

- struct common_sampler_params sparams;
+ struct common_params_sampling sampling;
+ struct common_params_speculative speculative;
+ struct common_params_vocoder vocoder;

  std::string model = ""; // model path // NOLINT
- std::string model_draft = ""; // draft model for speculative decoding // NOLINT
- std::string model_alias = "unknown"; // model alias // NOLINT
+ std::string model_alias = ""; // model alias // NOLINT
  std::string model_url = ""; // model url to download // NOLINT
  std::string hf_token = ""; // HF token // NOLINT
  std::string hf_repo = ""; // HF repo // NOLINT
@@ -270,8 +300,8 @@ struct common_params {
  bool warmup = true; // warmup run
  bool check_tensors = false; // validate tensor data

- std::string cache_type_k = "f16"; // KV cache data type for the K
- std::string cache_type_v = "f16"; // KV cache data type for the V
+ ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+ ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

  // multimodal models (see examples/llava)
  std::string mmproj = ""; // path to multimodal projector // NOLINT
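For code that consumes common_params (such as the binding sources updated in this release), the regrouped fields map roughly as sketched below; the old-to-new correspondences are inferred from the field comments in this diff and are not guaranteed to be one-to-one:

    // hedged sketch: adapting a caller to the regrouped common_params fields
    #include "common.h"

    static common_params make_params() {
        common_params params;

        // sampling options: the old sparams member (common_sampler_params) is now
        // the sampling member (common_params_sampling)
        params.sampling.ignore_eos = false;             // was params.sparams.ignore_eos

        // draft-model settings now live in the nested speculative struct
        params.speculative.model        = "draft.gguf"; // placeholder path; was params.model_draft
        params.speculative.n_gpu_layers = 99;           // was params.n_gpu_layers_draft
        params.speculative.n_max        = 16;           // appears to supersede the old params.n_draft

        // KV cache types are now ggml_type values instead of strings
        params.cache_type_k = GGML_TYPE_Q8_0;           // was params.cache_type_k = "q8_0"
        params.cache_type_v = GGML_TYPE_Q8_0;

        return params;
    }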
@@ -421,6 +451,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
  return parts;
  }

+ static bool string_starts_with(const std::string & str,
+ const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
+ return str.rfind(prefix, 0) == 0;
+ }
+
  bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
  void string_process_escapes(std::string & input);

@@ -451,17 +486,28 @@ struct common_init_result {

  struct common_init_result common_init_from_params(common_params & params);

- struct llama_model_params common_model_params_to_llama (const common_params & params);
+ struct llama_model_params common_model_params_to_llama ( common_params & params);
  struct llama_context_params common_context_params_to_llama(const common_params & params);
  struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

- struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
- struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_url(
+ const std::string & model_url,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_hf(
+ const std::string & repo,
+ const std::string & remote_path,
+ const std::string & local_path,
+ const std::string & hf_token,
+ const struct llama_model_params & params);

  // clear LoRA adapters from context, then apply new list of adapters
  void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

+ //
  // Batch utils
+ //

  void common_batch_clear(struct llama_batch & batch);

@@ -472,6 +518,16 @@ void common_batch_add(
  const std::vector<llama_seq_id> & seq_ids,
  bool logits);

+ //
+ // Token utils
+ //
+
+ // longest common prefix
+ size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+ // longet common subsequence
+ size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
  //
  // Vocab utils
  //
@@ -551,7 +607,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si
  // Embedding utils
  //

- void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+ // TODO: repace embd_norm with an enum
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);

  float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);