@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/common/ngram-cache.cpp
@@ -2,10 +2,13 @@
 #include "common.h"
 #include "log.h"
 
+#include <cinttypes>
 #include <cstdint>
+#include <cstdio>
 #include <fstream>
+#include <thread>
 
-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
                               std::vector<llama_token> & inp, int nnew, bool print_progress) {
     const int64_t t_start_ms = ggml_time_ms();
     const int64_t inp_size = inp.size();
@@ -17,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
     const int64_t i_start = std::max(inp_size - nnew, ngram_size);
     for (int64_t i = i_start; i < inp_size; ++i) {
         const int64_t ngram_start = i - ngram_size;
-        llama_ngram ngram(&inp[ngram_start], ngram_size);
+        common_ngram ngram(&inp[ngram_start], ngram_size);
         const llama_token token = inp[i];
 
-        llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+        common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
         if (part_it == ngram_cache.end()) {
-            llama_ngram_cache_part part;
+            common_ngram_cache_part part;
             part.emplace(token, 1);
             ngram_cache.emplace(ngram, part);
         } else {
-            llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
             if (token_count_it == part_it->second.end()) {
                 part_it->second.emplace(token, 1);
             } else {
@@ -59,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
 constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};
 
 // Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
     if (part_static_it == nc_static.end()) {
         return -1;
     }
-    const llama_ngram_cache_part part_static = part_static_it->second;
+    const common_ngram_cache_part part_static = part_static_it->second;
 
     int max_count_static = 0;
     int sum_count_static = 0;
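
Note on the strict thresholds in the hunk above: draft_min_sample_size_strict and draft_min_percent_strict are indexed by n-gram size (LLAMA_NGRAM_MAX is 4, per the array lengths), so shorter, noisier n-grams must show more evidence before a continuation is drafted. The comparison itself falls outside the changed lines, so the gate below is a minimal sketch under that assumption, not the package's literal code:

#include <cstdint>

// Sketch: accept a drafted token only if the n-gram was observed often enough
// (min_sample_size) and the candidate dominates the observed continuations
// (min_percent), using integer arithmetic to avoid floating point.
static bool accept_draft(int32_t max_count, int32_t sum_count,
                         int min_sample_size, int min_percent) {
    if (sum_count < min_sample_size) {
        return false;                                    // too little evidence
    }
    return 100 * max_count >= min_percent * sum_count;   // integer percent test
}

With min_percent = 66, a continuation seen 8 times out of 12 passes (800 >= 792), while 7 of 12 is rejected (700 < 792).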
@@ -92,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng
 
 // Try to draft a token from primary cache (context/dynamic), validate with static cache:
 static llama_token try_draft(
-    llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+    common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
     const int * min_sample_size, const int * min_percent) {
 
     llama_token drafted_token = -1;
 
     for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-        const llama_ngram ngram_primary = ngrams_primary[i];
+        const common_ngram ngram_primary = ngrams_primary[i];
 
-        llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+        common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
         if (part_primary_it == nc_primary.end()) {
             continue;
         }
-        const llama_ngram_cache_part part_primary = part_primary_it->second;
+        const common_ngram_cache_part part_primary = part_primary_it->second;
 
         int max_count_primary = 0;
         int max_count_static = 0;
@@ -114,7 +117,7 @@ static llama_token try_draft(
         for (std::pair<llama_token, int> token_count_primary : part_primary) {
             const llama_token token = token_count_primary.first;
 
-            llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+            common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
 
             const int32_t count_primary = token_count_primary.second;
             const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;
@@ -139,9 +142,9 @@ static llama_token try_draft(
     return drafted_token;
 }
 
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
 ) {
     GGML_ASSERT(draft.size() == 1);
     const int inp_size = inp.size();
@@ -154,21 +157,21 @@ void llama_ngram_cache_draft(
     llama_token drafted_token = -1;
 
     const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-    llama_ngram ngram_static;
+    common_ngram ngram_static;
     for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
         ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
     }
-    llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-    llama_ngram_cache_part part_static;
+    common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+    common_ngram_cache_part part_static;
     if (part_static_it != nc_static.end()) {
         part_static = part_static_it->second;
     }
 
     // cd = context + dynamic
-    std::vector<llama_ngram> ngrams_cd;
+    std::vector<common_ngram> ngrams_cd;
     for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
         const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-        llama_ngram ngram_cd;
+        common_ngram ngram_cd;
         for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
             ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
         }
@@ -193,16 +196,16 @@ void llama_ngram_cache_draft(
     }
 }
 
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
     std::ofstream file_out(filename, std::ios::binary);
-    for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
-        const llama_ngram ngram = item.first;
-        llama_ngram_cache_part token_counts = item.second;
+    for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
+        const common_ngram ngram = item.first;
+        common_ngram_cache_part token_counts = item.second;
         GGML_ASSERT(!token_counts.empty());
         const int32_t ntokens = token_counts.size();
         GGML_ASSERT(ntokens > 0);
 
-        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
+        file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
         file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
         for (std::pair<llama_token, int32_t> item2 : token_counts) {
             const llama_token token = item2.first;
@@ -216,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen
 
 }
 
-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
     std::ifstream hashmap_file(filename, std::ios::binary);
     if (!hashmap_file) {
         throw std::ifstream::failure("Unable to open file " + filename);
     }
-    llama_ngram_cache ngram_cache;
+    common_ngram_cache ngram_cache;
 
-    llama_ngram ngram;
+    common_ngram ngram;
     int32_t ntokens;
     llama_token token;
    int32_t count;
@@ -232,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     char * ntokensc = reinterpret_cast<char*>(&ntokens);
     char * tokenc = reinterpret_cast<char*>(&token);
     char * countc = reinterpret_cast<char*>(&count);
-    while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+    while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
         GGML_ASSERT(!hashmap_file.eof());
         GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
         GGML_ASSERT(ntokens > 0);
-        llama_ngram_cache_part token_counts;
+        common_ngram_cache_part token_counts;
 
         for (int i = 0; i < ntokens; ++i) {
             GGML_ASSERT(!hashmap_file.eof());
@@ -254,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
     return ngram_cache;
 }
 
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
-    for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
-        const llama_ngram ngram = ngram_part.first;
-        llama_ngram_cache_part part = ngram_part.second;
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
+    for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
+        const common_ngram ngram = ngram_part.first;
+        common_ngram_cache_part part = ngram_part.second;
 
-        llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+        common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
         if (part_merged_it == ngram_cache_target.end()) {
             ngram_cache_target.emplace(ngram, part);
             continue;
@@ -270,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
             const int32_t count = token_count.second;
             GGML_ASSERT(count > 0);
 
-            llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+            common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
             if (token_count_merged_it == part_merged_it->second.end()) {
                 part_merged_it->second.emplace(token, count);
                 continue;
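
Taken together, the ngram-cache.cpp changes are a mechanical rename of the lookup-decoding API from the llama_ prefix to the common_ prefix, plus new <cinttypes>, <cstdio>, and <thread> includes. For downstream code built against these headers, migration is a one-for-one substitution; a minimal before/after sketch, where only the names and signatures come from the diff above and the surrounding setup is assumed:

#include "ngram-cache.h"

#include <string>
#include <vector>

void build_and_save(std::vector<llama_token> & tokens, std::string path) {
    // 0.3.1: llama_ngram_cache cache;
    //        llama_ngram_cache_update(cache, 1, 4, tokens, (int) tokens.size(), false);
    //        llama_ngram_cache_save(cache, path);
    common_ngram_cache cache;
    common_ngram_cache_update(cache, /*ngram_min=*/1, /*ngram_max=*/4,
                              tokens, /*nnew=*/(int) tokens.size(), /*print_progress=*/false);
    common_ngram_cache_save(cache, path);
}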
package/src/llama.cpp/common/ngram-cache.h
@@ -12,22 +12,22 @@
 
 // Data structures to map n-grams to empirical token probabilities:
 
-struct llama_ngram {
+struct common_ngram {
     llama_token tokens[LLAMA_NGRAM_MAX];
 
-    llama_ngram() {
+    common_ngram() {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = -1;
         }
     }
 
-    llama_ngram(const llama_token * input, const int ngram_size) {
+    common_ngram(const llama_token * input, const int ngram_size) {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             tokens[i] = i < ngram_size ? input[i] : -1;
         }
     }
 
-    bool operator==(const llama_ngram & other) const {
+    bool operator==(const common_ngram & other) const {
         for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
             if (tokens[i] != other.tokens[i]) {
                 return false;
@@ -37,28 +37,28 @@ struct llama_ngram {
     }
 };
 
-struct llama_token_hash_function {
+struct common_token_hash_function {
     size_t operator()(const llama_token token) const {
         // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
         return token * 11400714819323198485llu;
     }
 };
 
-struct llama_ngram_hash_function {
-    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+struct common_ngram_hash_function {
+    size_t operator()(const common_ngram & ngram) const {
+        size_t hash = common_token_hash_function{}(ngram.tokens[0]);
         for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
+            hash ^= common_token_hash_function{}(ngram.tokens[i]);
         }
         return hash;
     }
 };
 
 // token -> number of times token has been seen
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
 
 // n-gram -> empirical distribution of following tokens
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
 
 
 // Update an ngram cache with tokens.
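
The hash above uses the Fibonacci-hashing constant from the linked post: 11400714819323198485 is 2^64 divided by the golden ratio, rounded, so multiplying even densely packed token ids by it scatters them across the full 64-bit range before unordered_map reduces them to a bucket index. A self-contained illustration (not part of the package):

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t k = 11400714819323198485llu;  // ~2^64 / phi, i.e. 0x9E3779B97F4A7C15
    for (uint64_t token = 0; token < 4; ++token) {
        // consecutive small ids map to widely separated 64-bit values
        std::printf("token %llu -> %016llx\n",
                    (unsigned long long) token,
                    (unsigned long long) (token * k));
    }
    return 0;
}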
@@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
 //
 // In order to get correct results inp_data can ONLY BE APPENDED TO.
 // Changes in the middle need a complete rebuild.
-void llama_ngram_cache_update(
-    llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
+void common_ngram_cache_update(
+    common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
 
 // Try to draft tokens from ngram caches.
 // inp: the tokens generated so far.
@@ -81,21 +81,21 @@ void llama_ngram_cache_update(
 // nc_context: ngram cache based on current context.
 // nc_dynamic: ngram cache based on previous user generations.
 // nc_static: ngram cache generated from a large text corpus, used for validation.
-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
     std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-    llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
+    common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
 
 // Save an ngram cache to a file.
 // ngram_cache: the ngram cache to save.
 // filename: the path under which to save the ngram cache.
-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
 
-// Load an ngram cache saved with llama_ngram_cache_save.
+// Load an ngram cache saved with common_ngram_cache_save.
 // filename: the path from which to load the ngram cache.
 // returns: an ngram cache containing the information saved to filename.
-llama_ngram_cache llama_ngram_cache_load(std::string & filename);
+common_ngram_cache common_ngram_cache_load(std::string & filename);
 
 // Merge two ngram caches.
 // ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
 // ngram_cache_add: the ngram cache to add to ngram_cache_target.
-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
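
Putting the renamed declarations together, here is a hedged end-to-end sketch of how a consumer might drive lookup drafting against 0.3.3; the cache-file path, the n-gram bounds, and the token vector are illustrative assumptions, and only the function names and signatures come from the header above:

#include "ngram-cache.h"

#include <string>
#include <vector>

std::vector<llama_token> draft_next_tokens(std::vector<llama_token> & inp) {
    std::string static_path = "ngrams-static.bin";  // hypothetical corpus-wide cache

    // Rebuild the context cache from the current tokens; load the static one.
    common_ngram_cache nc_context;
    common_ngram_cache_update(nc_context, /*ngram_min=*/1, /*ngram_max=*/4,
                              inp, /*nnew=*/(int) inp.size(), /*print_progress=*/false);
    common_ngram_cache nc_dynamic;  // empty here: no previous user generations
    common_ngram_cache nc_static = common_ngram_cache_load(static_path);

    // Per the GGML_ASSERT in common_ngram_cache_draft, draft must be seeded
    // with exactly one token (the last one sampled); inp is assumed non-empty.
    std::vector<llama_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, /*n_draft=*/8, /*ngram_min=*/1, /*ngram_max=*/4,
                             nc_context, nc_dynamic, nc_static);
    return draft;
}

After generation, accumulated statistics could be folded into the dynamic cache with common_ngram_cache_merge and persisted with common_ngram_cache_save, mirroring the save/load pair declared above.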