@fugood/llama.node 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (252)
  1. package/CMakeLists.txt +1 -8
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/package.json +4 -2
  17. package/src/DetokenizeWorker.cpp +1 -1
  18. package/src/EmbeddingWorker.cpp +2 -2
  19. package/src/LlamaCompletionWorker.cpp +10 -10
  20. package/src/LlamaCompletionWorker.h +2 -2
  21. package/src/LlamaContext.cpp +14 -17
  22. package/src/TokenizeWorker.cpp +1 -1
  23. package/src/common.hpp +5 -4
  24. package/src/llama.cpp/.github/workflows/build.yml +137 -29
  25. package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
  26. package/src/llama.cpp/.github/workflows/docker.yml +46 -34
  27. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
  28. package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
  29. package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
  30. package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
  31. package/src/llama.cpp/.github/workflows/server.yml +7 -0
  32. package/src/llama.cpp/CMakeLists.txt +26 -11
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/common/CMakeLists.txt +10 -10
  35. package/src/llama.cpp/common/arg.cpp +2041 -0
  36. package/src/llama.cpp/common/arg.h +77 -0
  37. package/src/llama.cpp/common/common.cpp +523 -1861
  38. package/src/llama.cpp/common/common.h +234 -106
  39. package/src/llama.cpp/common/console.cpp +3 -0
  40. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  41. package/src/llama.cpp/common/log.cpp +401 -0
  42. package/src/llama.cpp/common/log.h +66 -698
  43. package/src/llama.cpp/common/ngram-cache.cpp +39 -36
  44. package/src/llama.cpp/common/ngram-cache.h +19 -19
  45. package/src/llama.cpp/common/sampling.cpp +356 -350
  46. package/src/llama.cpp/common/sampling.h +62 -139
  47. package/src/llama.cpp/common/stb_image.h +5990 -6398
  48. package/src/llama.cpp/docs/build.md +72 -17
  49. package/src/llama.cpp/examples/CMakeLists.txt +1 -2
  50. package/src/llama.cpp/examples/batched/batched.cpp +49 -65
  51. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
  52. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
  53. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
  54. package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
  55. package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
  56. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
  58. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
  59. package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
  60. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
  61. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
  62. package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
  63. package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
  64. package/src/llama.cpp/examples/infill/infill.cpp +131 -192
  65. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
  66. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  67. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
  68. package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
  69. package/src/llama.cpp/examples/llava/clip.cpp +686 -150
  70. package/src/llama.cpp/examples/llava/clip.h +11 -2
  71. package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
  72. package/src/llama.cpp/examples/llava/llava.cpp +146 -26
  73. package/src/llama.cpp/examples/llava/llava.h +2 -3
  74. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
  75. package/src/llama.cpp/examples/llava/requirements.txt +1 -0
  76. package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
  77. package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
  78. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  79. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
  80. package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
  81. package/src/llama.cpp/examples/main/main.cpp +216 -313
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
  83. package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
  84. package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
  87. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
  88. package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
  89. package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
  90. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
  91. package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
  92. package/src/llama.cpp/examples/server/server.cpp +1347 -1531
  93. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
  94. package/src/llama.cpp/examples/server/utils.hpp +396 -107
  95. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  96. package/src/llama.cpp/examples/simple/simple.cpp +132 -106
  97. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  98. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
  99. package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
  100. package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
  101. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  102. package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
  103. package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
  104. package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
  105. package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
  106. package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
  107. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  108. package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
  109. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  110. package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
  111. package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
  112. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  113. package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
  114. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  115. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  116. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  117. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  118. package/src/llama.cpp/ggml/include/ggml.h +272 -505
  119. package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
  120. package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
  121. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
  122. package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
  123. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
  124. package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
  125. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
  126. package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
  127. package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
  128. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
  129. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
  130. package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
  131. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
  132. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
  133. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
  134. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
  135. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
  136. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
  137. package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
  138. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
  139. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
  140. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
  141. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
  142. package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
  143. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
  151. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
  152. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
  153. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
  155. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
  156. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
  157. package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
  158. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
  159. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
  160. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
  161. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
  162. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
  163. package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
  164. package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
  165. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
  167. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
  169. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
  172. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
  173. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
  174. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
  175. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
  176. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
  177. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
  178. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
  179. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
  180. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
  181. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  182. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
  183. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
  184. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
  185. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
  187. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
  188. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  189. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  190. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
  192. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
  193. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
  194. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  195. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
  196. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  197. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  198. package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
  199. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
  200. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
  201. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
  202. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
  203. package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
  204. package/src/llama.cpp/include/llama.h +296 -285
  205. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
  206. package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
  207. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  208. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  209. package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
  210. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  211. package/src/llama.cpp/src/llama-grammar.cpp +721 -122
  212. package/src/llama.cpp/src/llama-grammar.h +120 -15
  213. package/src/llama.cpp/src/llama-impl.h +156 -1
  214. package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
  215. package/src/llama.cpp/src/llama-sampling.h +39 -47
  216. package/src/llama.cpp/src/llama-vocab.cpp +390 -127
  217. package/src/llama.cpp/src/llama-vocab.h +60 -20
  218. package/src/llama.cpp/src/llama.cpp +6215 -3263
  219. package/src/llama.cpp/src/unicode-data.cpp +6 -4
  220. package/src/llama.cpp/src/unicode-data.h +4 -4
  221. package/src/llama.cpp/src/unicode.cpp +15 -7
  222. package/src/llama.cpp/tests/CMakeLists.txt +4 -2
  223. package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
  224. package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
  225. package/src/llama.cpp/tests/test-barrier.cpp +94 -0
  226. package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
  227. package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
  228. package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
  229. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
  230. package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
  231. package/src/llama.cpp/tests/test-log.cpp +39 -0
  232. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  233. package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
  234. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  235. package/src/llama.cpp/tests/test-rope.cpp +2 -1
  236. package/src/llama.cpp/tests/test-sampling.cpp +226 -142
  237. package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
  238. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  239. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  240. package/patches/llama.patch +0 -22
  241. package/src/llama.cpp/.github/workflows/bench.yml +0 -310
  242. package/src/llama.cpp/common/grammar-parser.cpp +0 -536
  243. package/src/llama.cpp/common/grammar-parser.h +0 -29
  244. package/src/llama.cpp/common/train.cpp +0 -1513
  245. package/src/llama.cpp/common/train.h +0 -233
  246. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
  247. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
  248. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
  249. package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
  250. package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
  251. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  252. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/CMakeLists.txt CHANGED
@@ -62,14 +62,7 @@ if (VULKAN_SDK)
  find_package(Vulkan REQUIRED)
  endif()
 
- find_program(PATCH patch REQUIRED)
-
- add_custom_target(
- patch ALL
- COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
- COMMENT "Applying patches"
- )
+ set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build common")
 
  set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared libraries")
  add_subdirectory("src/llama.cpp")
package/bin/** (prebuilt binaries) CHANGED — binary files, no textual diff
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.3.1",
+ "version": "0.3.3",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
@@ -11,7 +11,8 @@
  "build-native": "cmake-js compile",
  "clean": "rimraf build",
  "prepare": "husky",
- "commitlint": "commitlint --edit"
+ "commitlint": "commitlint --edit",
+ "release": "release-it"
  },
  "repository": {
  "type": "git",
@@ -55,6 +56,7 @@
  "cmake-js": "^7.3.0",
  "husky": "^9.0.11",
  "jest": "^29.7.0",
+ "release-it": "^17.7.0",
  "rimraf": "^6.0.1",
  "typescript": "^5.4.5",
  "wait-for-expect": "^3.0.2"
package/src/DetokenizeWorker.cpp CHANGED
@@ -8,7 +8,7 @@ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
  _tokens(std::move(tokens)) {}
 
  void DetokenizeWorker::Execute() {
- const auto text = ::llama_detokenize(_sess->context(), _tokens);
+ const auto text = ::common_detokenize(_sess->context(), _tokens);
  _text = std::move(text);
  }
 
package/src/EmbeddingWorker.cpp CHANGED
@@ -7,7 +7,7 @@ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
 
  void EmbeddingWorker::Execute() {
  llama_kv_cache_clear(_sess->context());
- auto tokens = ::llama_tokenize(_sess->context(), _text, true);
+ auto tokens = ::common_tokenize(_sess->context(), _text, true);
  // add SEP if not present
  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
  tokens.push_back(llama_token_sep(_sess->model()));
@@ -16,7 +16,7 @@ void EmbeddingWorker::Execute() {
  do {
  int ret =
  llama_decode(_sess->context(),
- llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
+ llama_batch_get_one(tokens.data(), tokens.size()));
  if (ret < 0) {
  SetError("Failed to inference, code: " + std::to_string(ret));
  break;
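For context on the hunk above: `llama_batch_get_one` dropped its position and sequence-id arguments, and the old `::llama_tokenize` helper is now `common_tokenize`. A minimal sketch of the updated embedding-decode path, assuming an already-initialized `llama_context *ctx` and `llama_model *model`; the `embed_prompt` wrapper is hypothetical and not part of the package:

```cpp
// Sketch only: mirrors the EmbeddingWorker change above, not a drop-in implementation.
#include <string>
#include <vector>

#include "common/common.h"
#include "llama.h"

// Hypothetical helper; `ctx` and `model` are assumed to be created elsewhere.
static int embed_prompt(llama_context *ctx, llama_model *model, const std::string &text) {
    llama_kv_cache_clear(ctx);

    // common_tokenize replaces the old ::llama_tokenize helper
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /*add_special*/ true);
    if (tokens.empty() || tokens.back() != llama_token_sep(model)) {
        tokens.push_back(llama_token_sep(model));  // add SEP if not present
    }

    // llama_batch_get_one now takes only the token pointer and count;
    // positions and sequence ids are tracked by the context.
    return llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()));
}
```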
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -34,7 +34,7 @@ size_t findStoppingStrings(const std::string &text,
 
  LlamaCompletionWorker::LlamaCompletionWorker(
  const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
- Napi::Function callback, gpt_params params,
+ Napi::Function callback, common_params params,
  std::vector<std::string> stop_words)
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
  _params(params), _stop_words(stop_words) {
@@ -59,16 +59,16 @@ void LlamaCompletionWorker::Execute() {
  size_t n_cur = 0;
  size_t n_input = 0;
  const auto model = _sess->model();
- const bool add_bos = llama_should_add_bos_token(model);
+ const bool add_bos = llama_add_bos_token(model);
  auto ctx = _sess->context();
 
- llama_set_rng_seed(ctx, _params.seed);
+ auto sparams = llama_sampler_chain_default_params();
 
- LlamaCppSampling sampling{llama_sampling_init(_params.sparams),
- llama_sampling_free};
+ LlamaCppSampling sampling{common_sampler_init(model, _params.sparams),
+ common_sampler_free};
 
  std::vector<llama_token> prompt_tokens =
- ::llama_tokenize(ctx, _params.prompt, add_bos);
+ ::common_tokenize(ctx, _params.prompt, add_bos);
  n_input = prompt_tokens.size();
  if (_sess->tokens_ptr()->size() > 0) {
  n_cur = common_part(*(_sess->tokens_ptr()), prompt_tokens);
@@ -102,18 +102,18 @@ void LlamaCompletionWorker::Execute() {
  _result.truncated = true;
  }
  int ret = llama_decode(
- ctx, llama_batch_get_one(embd->data() + n_cur, n_input, n_cur, 0));
+ ctx, llama_batch_get_one(embd->data() + n_cur, n_input));
  if (ret < 0) {
  SetError("Failed to decode token, code: " + std::to_string(ret));
  break;
  }
  // sample the next token
  const llama_token new_token_id =
- llama_sampling_sample(sampling.get(), ctx, nullptr);
- llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
+ common_sampler_sample(sampling.get(), ctx, -1);
+ common_sampler_accept(sampling.get(), new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
- auto token = llama_token_to_piece(ctx, new_token_id);
+ auto token = common_token_to_piece(ctx, new_token_id);
  _result.text += token;
  n_cur += n_input;
  _result.tokens_evaluated += n_input;
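The sampling hunks above move from the removed `llama_sampling_*` API to `common_sampler_*`. A rough sketch of the new generation-loop shape, using only calls that appear in this diff; the `generate` wrapper is hypothetical, and `model`, `ctx`, and `params` are assumed to be set up elsewhere (see the `common_init_from_params` sketch further down):

```cpp
// Sketch: token generation with the common_sampler_* API used above.
#include <string>
#include <vector>

#include "common/common.h"
#include "common/sampling.h"
#include "llama.h"

static std::string generate(llama_model *model, llama_context *ctx, common_params &params) {
    // common_sampler_init now takes the model plus the sampler params
    common_sampler *smpl = common_sampler_init(model, params.sparams);

    std::vector<llama_token> prompt = common_tokenize(ctx, params.prompt, /*add_special*/ true);
    llama_decode(ctx, llama_batch_get_one(prompt.data(), prompt.size()));

    std::string out;
    for (int i = 0; i < params.n_predict; ++i) {
        llama_token id = common_sampler_sample(smpl, ctx, /*idx*/ -1);
        common_sampler_accept(smpl, id, /*accept_grammar*/ true);
        if (llama_token_is_eog(model, id)) {
            break;  // end-of-generation token
        }
        out += common_token_to_piece(ctx, id);
        llama_decode(ctx, llama_batch_get_one(&id, 1));
    }

    common_sampler_free(smpl);
    return out;
}
```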
package/src/LlamaCompletionWorker.h CHANGED
@@ -12,7 +12,7 @@ class LlamaCompletionWorker : public Napi::AsyncWorker,
  public Napi::Promise::Deferred {
  public:
  LlamaCompletionWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
- Napi::Function callback, gpt_params params,
+ Napi::Function callback, common_params params,
  std::vector<std::string> stop_words = {});
 
  ~LlamaCompletionWorker();
@@ -28,7 +28,7 @@ protected:
 
  private:
  LlamaSessionPtr _sess;
- gpt_params _params;
+ common_params _params;
  std::vector<std::string> _stop_words;
  Napi::ThreadSafeFunction _tsfn;
  bool _has_callback = false;
package/src/LlamaContext.cpp CHANGED
@@ -7,8 +7,8 @@
  #include "SaveSessionWorker.h"
  #include "TokenizeWorker.h"
 
- std::vector<llama_chat_msg> get_messages(Napi::Array messages) {
- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> get_messages(Napi::Array messages) {
+ std::vector<common_chat_msg> chat;
  for (size_t i = 0; i < messages.Length(); i++) {
  auto message = messages.Get(i).As<Napi::Object>();
  chat.push_back({
@@ -67,7 +67,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  }
  auto options = info[0].As<Napi::Object>();
 
- gpt_params params;
+ common_params params;
  params.model = get_option<std::string>(options, "model", "");
  if (params.model.empty()) {
  Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
@@ -75,7 +75,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  params.embedding = get_option<bool>(options, "embedding", false);
  params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
  params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
- params.n_threads =
+ params.cpuparams.n_threads =
  get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
  params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
  params.use_mlock = get_option<bool>(options, "use_mlock", false);
@@ -86,17 +86,15 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  llama_backend_init();
  llama_numa_init(params.numa);
 
- llama_model *model;
- llama_context *ctx;
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ auto result = common_init_from_params(params);
 
- if (model == nullptr || ctx == nullptr) {
+ if (result.model == nullptr || result.context == nullptr) {
  Napi::TypeError::New(env, "Failed to load model")
  .ThrowAsJavaScriptException();
  }
 
- _sess = std::make_shared<LlamaSession>(model, ctx, params);
- _info = gpt_params_get_system_info(params);
+ _sess = std::make_shared<LlamaSession>(result.model, result.context, params);
+ _info = common_params_get_system_info(params);
  }
 
  // getSystemInfo(): string
@@ -111,7 +109,7 @@ Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
  }
  auto messages = info[0].As<Napi::Array>();
- auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+ auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
  return Napi::String::New(env, formatted);
  }
 
@@ -135,10 +133,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  }
  auto options = info[0].As<Napi::Object>();
 
- gpt_params params = _sess->params();
+ common_params params = _sess->params();
  if (options.Has("messages") && options.Get("messages").IsArray()) {
  auto messages = options.Get("messages").As<Napi::Array>();
- auto formatted = llama_chat_apply_template(_sess->model(), "", get_messages(messages), true);
+ auto formatted = common_chat_apply_template(_sess->model(), "", get_messages(messages), true);
  params.prompt = formatted;
  } else {
  params.prompt = get_option<std::string>(options, "prompt", "");
@@ -152,7 +150,6 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  params.sparams.top_k = get_option<int32_t>(options, "top_k", 40);
  params.sparams.top_p = get_option<float>(options, "top_p", 0.95f);
  params.sparams.min_p = get_option<float>(options, "min_p", 0.05f);
- params.sparams.tfs_z = get_option<float>(options, "tfs_z", 1.00f);
  params.sparams.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
  params.sparams.mirostat_tau =
  get_option<float>(options, "mirostat_tau", 5.00f);
@@ -167,11 +164,11 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  params.sparams.penalty_present =
  get_option<float>(options, "penalty_present", 0.00f);
  params.sparams.penalize_nl = get_option<bool>(options, "penalize_nl", false);
- params.sparams.typical_p = get_option<float>(options, "typical_p", 1.00f);
- params.ignore_eos = get_option<float>(options, "ignore_eos", false);
+ params.sparams.typ_p = get_option<float>(options, "typical_p", 1.00f);
+ params.sparams.ignore_eos = get_option<float>(options, "ignore_eos", false);
  params.sparams.grammar = get_option<std::string>(options, "grammar", "");
  params.n_keep = get_option<int32_t>(options, "n_keep", 0);
- params.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
+ params.sparams.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
  std::vector<std::string> stop_words;
  if (options.Has("stop") && options.Get("stop").IsArray()) {
  auto stop_words_array = options.Get("stop").As<Napi::Array>();
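Two other migrations are visible above: initialization now goes through `common_init_from_params`, which returns the model and context together, and several options moved homes (`n_threads` under `cpuparams`; `seed` and `ignore_eos` under `sparams`). A hedged sketch of that setup path, with illustrative option values only (the model path is hypothetical):

```cpp
// Sketch: context setup with common_params / common_init_from_params,
// loosely mirroring the LlamaContext constructor above.
#include "common/common.h"
#include "llama.h"

int main() {
    common_params params;
    params.model               = "model.gguf";              // hypothetical path
    params.n_ctx               = 512;
    params.n_batch             = 2048;
    params.cpuparams.n_threads = cpu_get_num_math() / 2;    // was params.n_threads
    params.n_gpu_layers        = -1;
    params.sparams.seed        = LLAMA_DEFAULT_SEED;        // was params.seed

    llama_backend_init();
    llama_numa_init(params.numa);

    // replaces llama_init_from_gpt_params; model and context come back together
    auto result = common_init_from_params(params);
    if (result.model == nullptr || result.context == nullptr) {
        return 1;  // failed to load model
    }

    // ... use result.model / result.context ...

    llama_free(result.context);
    llama_free_model(result.model);
    llama_backend_free();
    return 0;
}
```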
package/src/TokenizeWorker.cpp CHANGED
@@ -6,7 +6,7 @@ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
  : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
 
  void TokenizeWorker::Execute() {
- const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
+ const auto tokens = ::common_tokenize(_sess->context(), _text, false);
  _result.tokens = std::move(tokens);
  }
 
package/src/common.hpp CHANGED
@@ -1,6 +1,7 @@
  #pragma once
 
  #include "common/common.h"
+ #include "common/sampling.h"
  #include "llama.h"
  #include <memory>
  #include <mutex>
@@ -12,7 +13,7 @@
 
  typedef std::unique_ptr<llama_model, decltype(&llama_free_model)> LlamaCppModel;
  typedef std::unique_ptr<llama_context, decltype(&llama_free)> LlamaCppContext;
- typedef std::unique_ptr<llama_sampling_context, decltype(&llama_sampling_free)>
+ typedef std::unique_ptr<common_sampler, decltype(&common_sampler_free)>
  LlamaCppSampling;
  typedef std::unique_ptr<llama_batch, decltype(&llama_batch_free)> LlamaCppBatch;
 
@@ -46,7 +47,7 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 
  class LlamaSession {
  public:
- LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+ LlamaSession(llama_model *model, llama_context *ctx, common_params params)
  : model_(LlamaCppModel(model, llama_free_model)),
  ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
  tokens_.reserve(params.n_ctx);
@@ -64,7 +65,7 @@ public:
  tokens_ = std::move(tokens);
  }
 
- inline const gpt_params &params() const { return params_; }
+ inline const common_params &params() const { return params_; }
 
  inline std::mutex &get_mutex() { return mutex; }
 
@@ -78,7 +79,7 @@ public:
  private:
  LlamaCppModel model_;
  LlamaCppContext ctx_;
- const gpt_params params_;
+ const common_params params_;
  std::vector<llama_token> tokens_{};
  std::mutex mutex;
  };
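The `LlamaCppSampling` typedef above now pairs `common_sampler` with `common_sampler_free` as its deleter, so the sampler is released automatically. A brief usage sketch; the `sample_one` wrapper is hypothetical, and `model`, `ctx`, and `params` are assumed to come from an existing `LlamaSession`:

```cpp
// Sketch: RAII ownership of a common_sampler via the LlamaCppSampling typedef.
#include "common.hpp"  // the header shown above

llama_token sample_one(llama_model *model, llama_context *ctx, const common_params &params) {
    LlamaCppSampling sampling{common_sampler_init(model, params.sparams),
                              common_sampler_free};
    llama_token id = common_sampler_sample(sampling.get(), ctx, /*idx*/ -1);
    common_sampler_accept(sampling.get(), id, /*accept_grammar*/ true);
    return id;  // common_sampler_free runs when `sampling` goes out of scope
}
```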