@fugood/llama.node 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (225)
  1. package/CMakeLists.txt +5 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +29 -1
  17. package/package.json +1 -1
  18. package/src/EmbeddingWorker.cpp +15 -5
  19. package/src/EmbeddingWorker.h +2 -1
  20. package/src/LlamaCompletionWorker.cpp +17 -1
  21. package/src/LlamaContext.cpp +86 -18
  22. package/src/LlamaContext.h +2 -0
  23. package/src/llama.cpp/.github/workflows/build.yml +197 -159
  24. package/src/llama.cpp/.github/workflows/docker.yml +5 -8
  25. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  26. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  27. package/src/llama.cpp/CMakeLists.txt +11 -6
  28. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  29. package/src/llama.cpp/cmake/common.cmake +33 -0
  30. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  31. package/src/llama.cpp/common/CMakeLists.txt +6 -2
  32. package/src/llama.cpp/common/arg.cpp +426 -245
  33. package/src/llama.cpp/common/common.cpp +143 -80
  34. package/src/llama.cpp/common/common.h +81 -24
  35. package/src/llama.cpp/common/sampling.cpp +53 -19
  36. package/src/llama.cpp/common/sampling.h +22 -1
  37. package/src/llama.cpp/common/speculative.cpp +274 -0
  38. package/src/llama.cpp/common/speculative.h +28 -0
  39. package/src/llama.cpp/docs/build.md +101 -148
  40. package/src/llama.cpp/examples/CMakeLists.txt +32 -13
  41. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  42. package/src/llama.cpp/examples/batched/batched.cpp +5 -4
  43. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  44. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  45. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  47. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  48. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  49. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  50. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  51. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  52. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  54. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  55. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  56. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  57. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  58. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  59. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  60. package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
  61. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/infill/infill.cpp +1 -1
  63. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  64. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
  65. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  66. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  67. package/src/llama.cpp/examples/llava/clip.cpp +262 -66
  68. package/src/llama.cpp/examples/llava/clip.h +8 -2
  69. package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
  70. package/src/llama.cpp/examples/llava/llava.cpp +46 -19
  71. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
  72. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  73. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
  75. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  76. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
  77. package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
  78. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  79. package/src/llama.cpp/examples/main/main.cpp +9 -5
  80. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  81. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
  83. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  84. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  85. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  86. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  87. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  88. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  89. package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
  90. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  91. package/src/llama.cpp/examples/run/run.cpp +911 -0
  92. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
  94. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
  95. package/src/llama.cpp/examples/server/server.cpp +1758 -886
  96. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  97. package/src/llama.cpp/examples/server/utils.hpp +94 -304
  98. package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
  99. package/src/llama.cpp/examples/simple/simple.cpp +4 -0
  100. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
  101. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
  102. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
  104. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  106. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
  108. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  109. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  110. package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
  111. package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
  112. package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
  113. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  114. package/src/llama.cpp/ggml/include/ggml.h +106 -24
  115. package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
  116. package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
  117. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
  118. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
  119. package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
  120. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
  121. package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
  122. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
  123. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  124. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  125. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
  126. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  127. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  128. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  129. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  130. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  131. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  132. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  133. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  134. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  135. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
  136. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  137. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  138. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
  139. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
  140. package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
  141. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  142. package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
  143. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
  144. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  145. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  146. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
  147. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
  148. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  149. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  150. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
  151. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
  152. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
  153. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
  154. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  155. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
  156. package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
  157. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
  158. package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
  159. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
  160. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
  161. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
  162. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  163. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  164. package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
  165. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
  166. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
  167. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
  168. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
  169. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
  170. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
  171. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
  172. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  173. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  174. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
  175. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
  176. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  177. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
  178. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  179. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  180. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
  181. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
  182. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
  183. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  184. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  185. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  186. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
  187. package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
  188. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
  189. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
  190. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
  191. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
  192. package/src/llama.cpp/ggml/src/ggml.c +367 -207
  193. package/src/llama.cpp/include/llama-cpp.h +25 -0
  194. package/src/llama.cpp/include/llama.h +26 -19
  195. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  196. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  197. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  198. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  199. package/src/llama.cpp/src/CMakeLists.txt +2 -7
  200. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  201. package/src/llama.cpp/src/llama-grammar.h +2 -5
  202. package/src/llama.cpp/src/llama-sampling.cpp +35 -90
  203. package/src/llama.cpp/src/llama-vocab.cpp +6 -1
  204. package/src/llama.cpp/src/llama.cpp +1748 -640
  205. package/src/llama.cpp/src/unicode.cpp +62 -51
  206. package/src/llama.cpp/src/unicode.h +9 -10
  207. package/src/llama.cpp/tests/CMakeLists.txt +48 -37
  208. package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
  209. package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
  210. package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
  211. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  212. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  213. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  214. package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
  215. package/src/llama.cpp/tests/test-rope.cpp +61 -20
  216. package/src/llama.cpp/tests/test-sampling.cpp +2 -2
  217. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  218. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  219. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  220. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  221. package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
  222. package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
  223. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
  224. package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
  225. package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/CMakeLists.txt CHANGED
@@ -6,6 +6,11 @@ project (llama-node)

  set(CMAKE_CXX_STANDARD 17)

+ execute_process(COMMAND
+   git apply ${CMAKE_CURRENT_SOURCE_DIR}/scripts/llama.cpp.patch
+   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+
  if(NOT DEFINED napi_build_version)
    set(napi_build_version 6)
  endif()
package/bin/** (llama-node.node, node.lib): binary files changed, contents not shown
package/lib/binding.ts CHANGED
@@ -8,12 +8,15 @@ export type ChatMessage = {
  export type LlamaModelOptions = {
    model: string
    embedding?: boolean
+   embd_normalize?: number
+   pooling_type?: number
    n_ctx?: number
    n_batch?: number
    n_threads?: number
    n_gpu_layers?: number
    use_mlock?: boolean
    use_mmap?: boolean
+   vocab_only?: boolean
  }

  export type LlamaCompletionOptions = {
@@ -23,7 +26,21 @@ export type LlamaCompletionOptions = {
    temperature?: number
    top_k?: number
    top_p?: number
-   repetition_penalty?: number
+   min_p?: number
+   mirostat?: number
+   mirostat_tau?: number
+   mirostat_eta?: number
+   penalty_last_n?: number
+   penalty_repeat?: number
+   penalty_freq?: number
+   penalty_present?: number
+   typ_p?: number
+   xtc_threshold?: number
+   xtc_probability?: number
+   dry_multiplier?: number
+   dry_base?: number
+   dry_allowed_length?: number
+   dry_penalty_last_n?: number
    n_predict?: number
    max_length?: number
    max_tokens?: number
@@ -37,6 +54,16 @@ export type LlamaCompletionResult = {
    tokens_predicted: number
    tokens_evaluated: number
    truncated: boolean
+   timings: {
+     prompt_n: number
+     prompt_ms: number
+     prompt_per_token_ms: number
+     prompt_per_second: number
+     predicted_n: number
+     predicted_ms: number
+     predicted_per_token_ms: number
+     predicted_per_second: number
+   }
  }

  export type LlamaCompletionToken = {
@@ -54,6 +81,7 @@ export type EmbeddingResult = {
  export interface LlamaContext {
    new (options: LlamaModelOptions): LlamaContext
    getSystemInfo(): string
+   getModelInfo(): object
    getFormattedChat(messages: ChatMessage[]): string
    completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
    stopCompletion(): void
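The new completion options map onto llama.cpp's common sampling parameters (see the LlamaContext.cpp hunks further below). A minimal TypeScript sketch of how they might be passed; the import path is an assumption, and Partial<> is used to avoid asserting which fields the API requires:

    // Hypothetical usage of the new sampling fields; the values mirror the native defaults in this diff.
    import type { LlamaCompletionOptions } from './lib/binding' // import path is an assumption

    const sampling: Partial<LlamaCompletionOptions> = {
      temperature: 0.8,
      top_k: 40,
      top_p: 0.95,
      min_p: 0.05,
      typ_p: 1.0,
      // XTC sampler controls (newly exposed)
      xtc_threshold: 0.0,
      xtc_probability: 0.1,
      // DRY repetition suppression (newly exposed)
      dry_multiplier: 1.75,
      dry_base: 2,
      dry_allowed_length: -1,
      dry_penalty_last_n: 0,
      // split repetition penalties, replacing the removed `repetition_penalty`
      penalty_last_n: 64,
      penalty_repeat: 1.0,
      penalty_freq: 0.0,
      penalty_present: 0.0,
    }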
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@fugood/llama.node",
    "access": "public",
-   "version": "0.3.3",
+   "version": "0.3.5",
    "description": "Llama.cpp for Node.js",
    "main": "lib/index.js",
    "scripts": {
package/src/EmbeddingWorker.cpp CHANGED
@@ -2,8 +2,8 @@
  #include "LlamaContext.h"

  EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
-                                  LlamaSessionPtr &sess, std::string text)
-     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+                                  LlamaSessionPtr &sess, std::string text, common_params &params)
+     : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text), _params(params) {}

  void EmbeddingWorker::Execute() {
    llama_kv_cache_clear(_sess->context());
@@ -14,20 +14,30 @@ void EmbeddingWorker::Execute() {
    }
    const int n_embd = llama_n_embd(_sess->model());
    do {
+     auto ctx = _sess->context();
      int ret =
-         llama_decode(_sess->context(),
+         llama_decode(ctx,
                       llama_batch_get_one(tokens.data(), tokens.size()));
      if (ret < 0) {
        SetError("Failed to inference, code: " + std::to_string(ret));
        break;
      }
-     const float *embd = llama_get_embeddings_seq(_sess->context(), 0);
+
+     float *embd;
+     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+       embd = llama_get_embeddings(ctx);
+     } else {
+       embd = llama_get_embeddings_seq(ctx, 0);
+     }
      if (embd == nullptr) {
        SetError("Failed to get embeddings");
        break;
      }
      _result.embedding.resize(n_embd);
-     memcpy(_result.embedding.data(), embd, n_embd * sizeof(float));
+     std::vector<float> embedding(embd, embd + n_embd), out(embd, embd + n_embd);
+     common_embd_normalize(embedding.data(), out.data(), n_embd, _params.embd_normalize);
+     memcpy(_result.embedding.data(), out.data(), n_embd * sizeof(float));
    } while (false);
  }
package/src/EmbeddingWorker.h CHANGED
@@ -9,7 +9,7 @@ class EmbeddingWorker : public Napi::AsyncWorker,
                          public Napi::Promise::Deferred {
  public:
    EmbeddingWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
-                   std::string text);
+                   std::string text, common_params &params);

  protected:
    void Execute();
@@ -19,5 +19,6 @@ protected:
  private:
    LlamaSessionPtr _sess;
    std::string _text;
+   common_params _params;
    EmbeddingResult _result;
  };
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -64,7 +64,7 @@ void LlamaCompletionWorker::Execute() {

    auto sparams = llama_sampler_chain_default_params();

-   LlamaCppSampling sampling{common_sampler_init(model, _params.sparams),
+   LlamaCppSampling sampling{common_sampler_init(model, _params.sampling),
                              common_sampler_free};

    std::vector<llama_token> prompt_tokens =
@@ -159,6 +159,22 @@ void LlamaCompletionWorker::OnOK() {
        Napi::Boolean::New(Napi::AsyncWorker::Env(), _result.truncated));
    result.Set("text",
        Napi::String::New(Napi::AsyncWorker::Env(), _result.text.c_str()));
+
+   auto ctx = _sess->context();
+   const auto timings_token = llama_perf_context(ctx);
+
+   auto timingsResult = Napi::Object::New(Napi::AsyncWorker::Env());
+   timingsResult.Set("prompt_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_p_eval));
+   timingsResult.Set("prompt_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms));
+   timingsResult.Set("prompt_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_p_eval_ms / timings_token.n_p_eval));
+   timingsResult.Set("prompt_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_p_eval_ms * timings_token.n_p_eval));
+   timingsResult.Set("predicted_n", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.n_eval));
+   timingsResult.Set("predicted_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms));
+   timingsResult.Set("predicted_per_token_ms", Napi::Number::New(Napi::AsyncWorker::Env(), timings_token.t_eval_ms / timings_token.n_eval));
+   timingsResult.Set("predicted_per_second", Napi::Number::New(Napi::AsyncWorker::Env(), 1e3 / timings_token.t_eval_ms * timings_token.n_eval));
+
+   result.Set("timings", timingsResult);
+
    Napi::Promise::Deferred::Resolve(result);
  }
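The timings object attached to the completion result above mirrors llama_perf_context(): prompt (prefill) versus predicted (decode) token counts, wall-clock milliseconds, and derived per-token and per-second rates. A small helper typed against the updated result shape; the import path is an assumption:

    import type { LlamaCompletionResult } from './lib/binding' // import path is an assumption

    // Summarize prefill vs. decode throughput from a completion result.
    function summarizeTimings(result: LlamaCompletionResult): string {
      const t = result.timings
      return (
        `prompt: ${t.prompt_n} tokens in ${t.prompt_ms.toFixed(1)} ms ` +
        `(${t.prompt_per_second.toFixed(1)} tok/s), ` +
        `predicted: ${t.predicted_n} tokens in ${t.predicted_ms.toFixed(1)} ms ` +
        `(${t.predicted_per_second.toFixed(1)} tok/s)`
      )
    }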
package/src/LlamaContext.cpp CHANGED
@@ -25,6 +25,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       {InstanceMethod<&LlamaContext::GetSystemInfo>(
            "getSystemInfo",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::GetModelInfo>(
+            "getModelInfo",
+            static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::GetFormattedChat>(
            "getFormattedChat",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -72,9 +75,23 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
    if (params.model.empty()) {
      Napi::TypeError::New(env, "Model is required").ThrowAsJavaScriptException();
    }
-   params.embedding = get_option<bool>(options, "embedding", false);
+
+   params.vocab_only = get_option<bool>(options, "vocab_only", false);
+   if (params.vocab_only) {
+     params.warmup = false;
+   }
+
    params.n_ctx = get_option<int32_t>(options, "n_ctx", 512);
    params.n_batch = get_option<int32_t>(options, "n_batch", 2048);
+   params.embedding = get_option<bool>(options, "embedding", false);
+   if (params.embedding) {
+     // For non-causal models, batch size must be equal to ubatch size
+     params.n_ubatch = params.n_batch;
+   }
+   params.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
+   int32_t pooling_type = get_option<int32_t>(options, "pooling_type", -1);
+   params.pooling_type = (enum llama_pooling_type) pooling_type;
+
    params.cpuparams.n_threads =
        get_option<int32_t>(options, "n_threads", cpu_get_num_math() / 2);
    params.n_gpu_layers = get_option<int32_t>(options, "n_gpu_layers", -1);
@@ -102,6 +119,44 @@ Napi::Value LlamaContext::GetSystemInfo(const Napi::CallbackInfo &info) {
    return Napi::String::New(info.Env(), _info);
  }

+ bool validateModelChatTemplate(const struct llama_model * model) {
+   std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
+   std::string template_key = "tokenizer.chat_template";
+   int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+   if (res >= 0) {
+     llama_chat_message chat[] = {{"user", "test"}};
+     std::string tmpl = std::string(model_template.data(), model_template.size());
+     int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
+     return chat_res > 0;
+   }
+   return res > 0;
+ }
+
+ // getModelInfo(): object
+ Napi::Value LlamaContext::GetModelInfo(const Napi::CallbackInfo &info) {
+   char desc[1024];
+   auto model = _sess->model();
+   llama_model_desc(model, desc, sizeof(desc));
+
+   int count = llama_model_meta_count(model);
+   Napi::Object metadata = Napi::Object::New(info.Env());
+   for (int i = 0; i < count; i++) {
+     char key[256];
+     llama_model_meta_key_by_index(model, i, key, sizeof(key));
+     char val[2048];
+     llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
+
+     metadata.Set(key, val);
+   }
+   Napi::Object details = Napi::Object::New(info.Env());
+   details.Set("desc", desc);
+   details.Set("nParams", llama_model_n_params(model));
+   details.Set("size", llama_model_size(model));
+   details.Set("isChatTemplateSupported", validateModelChatTemplate(model));
+   details.Set("metadata", metadata);
+   return details;
+ }
+
  // getFormattedChat(messages: [{ role: string, content: string }]): string
  Napi::Value LlamaContext::GetFormattedChat(const Napi::CallbackInfo &info) {
    Napi::Env env = info.Env();
@@ -146,29 +201,34 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
        .ThrowAsJavaScriptException();
    }
    params.n_predict = get_option<int32_t>(options, "n_predict", -1);
-   params.sparams.temp = get_option<float>(options, "temperature", 0.80f);
-   params.sparams.top_k = get_option<int32_t>(options, "top_k", 40);
-   params.sparams.top_p = get_option<float>(options, "top_p", 0.95f);
-   params.sparams.min_p = get_option<float>(options, "min_p", 0.05f);
-   params.sparams.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
-   params.sparams.mirostat_tau =
+   params.sampling.temp = get_option<float>(options, "temperature", 0.80f);
+   params.sampling.top_k = get_option<int32_t>(options, "top_k", 40);
+   params.sampling.top_p = get_option<float>(options, "top_p", 0.95f);
+   params.sampling.min_p = get_option<float>(options, "min_p", 0.05f);
+   params.sampling.mirostat = get_option<int32_t>(options, "mirostat", 0.00f);
+   params.sampling.mirostat_tau =
        get_option<float>(options, "mirostat_tau", 5.00f);
-   params.sparams.mirostat_eta =
+   params.sampling.mirostat_eta =
        get_option<float>(options, "mirostat_eta", 0.10f);
-   params.sparams.penalty_last_n =
+   params.sampling.penalty_last_n =
        get_option<int32_t>(options, "penalty_last_n", 64);
-   params.sparams.penalty_repeat =
+   params.sampling.penalty_repeat =
        get_option<float>(options, "penalty_repeat", 1.00f);
-   params.sparams.penalty_freq =
+   params.sampling.penalty_freq =
        get_option<float>(options, "penalty_freq", 0.00f);
-   params.sparams.penalty_present =
+   params.sampling.penalty_present =
        get_option<float>(options, "penalty_present", 0.00f);
-   params.sparams.penalize_nl = get_option<bool>(options, "penalize_nl", false);
-   params.sparams.typ_p = get_option<float>(options, "typical_p", 1.00f);
-   params.sparams.ignore_eos = get_option<float>(options, "ignore_eos", false);
-   params.sparams.grammar = get_option<std::string>(options, "grammar", "");
+   params.sampling.typ_p = get_option<float>(options, "typical_p", 1.00f);
+   params.sampling.xtc_threshold = get_option<float>(options, "xtc_threshold", 0.00f);
+   params.sampling.xtc_probability = get_option<float>(options, "xtc_probability", 0.10f);
+   params.sampling.dry_multiplier = get_option<float>(options, "dry_multiplier", 1.75f);
+   params.sampling.dry_base = get_option<float>(options, "dry_base", 2);
+   params.sampling.dry_allowed_length = get_option<float>(options, "dry_allowed_length", -1);
+   params.sampling.dry_penalty_last_n = get_option<float>(options, "dry_penalty_last_n", 0);
+   params.sampling.ignore_eos = get_option<bool>(options, "ignore_eos", false);
+   params.sampling.grammar = get_option<std::string>(options, "grammar", "");
    params.n_keep = get_option<int32_t>(options, "n_keep", 0);
-   params.sparams.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
+   params.sampling.seed = get_option<int32_t>(options, "seed", LLAMA_DEFAULT_SEED);
    std::vector<std::string> stop_words;
    if (options.Has("stop") && options.Get("stop").IsArray()) {
      auto stop_words_array = options.Get("stop").As<Napi::Array>();
@@ -243,8 +303,16 @@ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
      Napi::TypeError::New(env, "Context is disposed")
          .ThrowAsJavaScriptException();
    }
+   auto options = Napi::Object::New(env);
+   if (info.Length() >= 2 && info[1].IsObject()) {
+     options = info[1].As<Napi::Object>();
+   }
+
+   common_params embdParams;
+   embdParams.embedding = true;
+   embdParams.embd_normalize = get_option<int32_t>(options, "embd_normalize", 2);
    auto text = info[0].ToString().Utf8Value();
-   auto *worker = new EmbeddingWorker(info, _sess, text);
+   auto *worker = new EmbeddingWorker(info, _sess, text, embdParams);
    worker->Queue();
    return worker->Promise();
  }
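A sketch of the new model-level options and getModelInfo(), typed against the updated binding.ts. Construction of the context is left abstract (createContext is a placeholder, not part of the package API shown in this diff); the comments restate the behavior of the native code above plus the usual llama.cpp conventions:

    import type { LlamaContext, LlamaModelOptions } from './lib/binding' // import path is an assumption

    const modelOptions: LlamaModelOptions = {
      model: './model.gguf',
      embedding: true,   // the native layer also forces n_ubatch = n_batch in this case
      embd_normalize: 2, // default: Euclidean normalization of returned embeddings
      pooling_type: -1,  // -1 leaves the pooling type unspecified (model default)
      vocab_only: false, // true loads vocab/metadata only and disables warmup
    }

    declare function createContext(options: LlamaModelOptions): LlamaContext // placeholder

    const ctx = createContext(modelOptions)
    const info = ctx.getModelInfo() as {
      desc: string
      nParams: number
      size: number
      isChatTemplateSupported: boolean
      metadata: Record<string, string>
    }
    console.log(info.desc, info.nParams, info.size, info.isChatTemplateSupported)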
package/src/LlamaContext.h CHANGED
@@ -9,6 +9,7 @@ public:

  private:
    Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
+   Napi::Value GetModelInfo(const Napi::CallbackInfo &info);
    Napi::Value GetFormattedChat(const Napi::CallbackInfo &info);
    Napi::Value Completion(const Napi::CallbackInfo &info);
    void StopCompletion(const Napi::CallbackInfo &info);
@@ -20,6 +21,7 @@ private:
    Napi::Value Release(const Napi::CallbackInfo &info);

    std::string _info;
+   Napi::Object _meta;
    LlamaSessionPtr _sess = nullptr;
    LlamaCompletionWorker *_wip = nullptr;
  };