@fugood/llama.node 0.0.1-alpha.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/CMakeLists.txt +42 -7
  2. package/README.md +10 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/lib/binding.js +1 -1
  12. package/lib/binding.ts +16 -2
  13. package/lib/index.ts +2 -2
  14. package/package.json +15 -3
  15. package/src/DetokenizeWorker.cpp +22 -0
  16. package/src/DetokenizeWorker.h +19 -0
  17. package/src/EmbeddingWorker.cpp +46 -0
  18. package/src/EmbeddingWorker.h +23 -0
  19. package/src/LlamaCompletionWorker.cpp +5 -1
  20. package/src/LlamaCompletionWorker.h +4 -0
  21. package/src/LlamaContext.cpp +80 -1
  22. package/src/LlamaContext.h +3 -0
  23. package/src/TokenizeWorker.cpp +26 -0
  24. package/src/TokenizeWorker.h +23 -0
  25. package/src/common.hpp +12 -7
  26. package/src/llama.cpp/CMakeLists.txt +13 -7
  27. package/src/llama.cpp/common/common.cpp +221 -173
  28. package/src/llama.cpp/common/common.h +19 -8
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/log.h +2 -2
  31. package/src/llama.cpp/common/sampling.cpp +17 -1
  32. package/src/llama.cpp/common/sampling.h +28 -20
  33. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  34. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  35. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  36. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  37. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  38. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  39. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  40. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  41. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  42. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  43. package/src/llama.cpp/examples/main/main.cpp +10 -8
  44. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  45. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  46. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  47. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  48. package/src/llama.cpp/examples/server/server.cpp +97 -86
  49. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  50. package/src/llama.cpp/ggml-backend.c +7 -5
  51. package/src/llama.cpp/ggml-impl.h +339 -4
  52. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  53. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  54. package/src/llama.cpp/ggml-quants.c +302 -293
  55. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  56. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  57. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  58. package/src/llama.cpp/ggml.c +1469 -116
  59. package/src/llama.cpp/ggml.h +37 -7
  60. package/src/llama.cpp/llama.cpp +969 -432
  61. package/src/llama.cpp/llama.h +46 -14
  62. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  63. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  64. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  65. package/src/llama.cpp/requirements.txt +1 -0
  66. package/src/llama.cpp/sgemm.cpp +134 -103
  67. package/src/llama.cpp/sgemm.h +4 -2
  68. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  69. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  70. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  71. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  72. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  73. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  74. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  75. package/src/llama.cpp/unicode-data.cpp +1188 -656
  76. package/src/llama.cpp/unicode-data.h +4 -3
  77. package/src/llama.cpp/unicode.cpp +590 -49
  78. package/src/llama.cpp/unicode.h +6 -3
  79. package/bin/win32/arm64/llama-node.node +0 -0
  80. package/bin/win32/arm64/node.lib +0 -0
  81. package/bin/win32/x64/llama-node.node +0 -0
  82. package/bin/win32/x64/node.lib +0 -0
  83. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  84. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/CMakeLists.txt CHANGED
@@ -26,7 +26,7 @@ string(REPLACE "i686" "ia32" ARCH ${ARCH})
  string(REPLACE "i386" "ia32" ARCH ${ARCH})
  string(REPLACE "armv7l" "arm" ARCH ${ARCH})
  string(REPLACE "arm" "arm" ARCH ${ARCH})
- string(REPLACE "arm64ex" "arm64" ARCH ${ARCH})
+ string(REPLACE "arm64x" "arm64" ARCH ${ARCH})
  string(REPLACE "aarch64" "arm64" ARCH ${ARCH})

  if(DEFINED VARIANT)
@@ -58,6 +58,12 @@ include_directories(${CMAKE_JS_INC})
  # flags: -fPIC
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)

+ # VULKAN_SDK
+ if (VULKAN_SDK)
+ set(ENV{VULKAN_SDK} ${VULKAN_SDK})
+ find_package(Vulkan REQUIRED)
+ endif()
+
  set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
  add_subdirectory("src/llama.cpp")

@@ -71,6 +77,12 @@ file(
  "src/LlamaCompletionWorker.h"
  "src/LlamaContext.cpp"
  "src/LlamaContext.h"
+ "src/TokenizeWorker.cpp"
+ "src/TokenizeWorker.h"
+ "src/DetokenizeWorker.cpp"
+ "src/DetokenizeWorker.h"
+ "src/EmbeddingWorker.cpp"
+ "src/EmbeddingWorker.h"
  "src/LoadSessionWorker.cpp"
  "src/LoadSessionWorker.h"
  "src/SaveSessionWorker.cpp"
@@ -81,31 +93,54 @@ add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
  set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
  target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common)

+ add_custom_target(copy_assets ALL DEPENDS ${PROJECT_NAME})
+
+ add_custom_command(
+ TARGET copy_assets
+ COMMAND ${CMAKE_COMMAND} -E remove_directory ${PLATFORM_BINARY_DIR}
+ COMMENT "Cleaning bin folder"
+ )
+
  if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
  # Generate node.lib
  execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
  # copy target to bin folder
  get_filename_component(CMAKE_JS_NODELIB_TARGET_NAME ${CMAKE_JS_NODELIB_TARGET} NAME)
- add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
+ add_custom_command(TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_JS_NODELIB_TARGET} ${PLATFORM_BINARY_DIR}/${CMAKE_JS_NODELIB_TARGET_NAME}
  COMMENT "Copying to bin folder"
  )
  endif()

  # copy target to bin folder
- add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
+ add_custom_command(TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${PROJECT_NAME}> ${PLATFORM_BINARY_DIR}/$<TARGET_FILE_NAME:${PROJECT_NAME}>
  COMMENT "Copying to bin folder"
  )

  if (LLAMA_METAL)
  # copy ${CMAKE_BINARY_DIR}/bin/default.metallib
- add_custom_target(copy_metallib)
  add_custom_command(
- TARGET copy_metallib
+ TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bin/default.metallib ${PLATFORM_BINARY_DIR}/default.metallib
  COMMENT "Copying default.metallib to bin folder"
  )
- add_dependencies(copy_metallib ggml-metal)
- add_dependencies(${PROJECT_NAME} copy_metallib)
+ add_dependencies(copy_assets ggml-metal)
+ endif()
+
+ if (LLAMA_CLBLAST)
+ find_package(CLBlast)
+ if (CLBlast_FOUND)
+ message(STATUS "CLBlast found: ${CLBlast_DIR}")
+ file(
+ GLOB CLBlast_SO_FILES
+ ${CLBlast_DIR}/../../../bin/clblast.dll
+ ${CLBlast_DIR}/../../../lib/libclblast.so
+ )
+ add_custom_command(
+ TARGET copy_assets
+ COMMAND ${CMAKE_COMMAND} -E copy ${CLBlast_SO_FILES} ${PLATFORM_BINARY_DIR}
+ COMMENT "Copying CLBlast SO files to bin folder"
+ )
+ endif()
  endif()
package/README.md CHANGED
@@ -1,5 +1,9 @@
  # llama.node

+ [![CI](https://github.com/mybigday/llama.node/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/mybigday/llama.node/actions/workflows/ci.yml)
+ [![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node)
+ ![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node)
+
  Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).

  [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
@@ -22,6 +26,7 @@ const context = await loadModel({
  n_ctx: 2048,
  n_gpu_layers: 1, // > 0: enable GPU
  // embedding: true, // use embedding
+ // lib_variant: 'opencl', // Change backend
  })

  // Do completion
@@ -40,6 +45,11 @@ const { text, timings } = await context.completion(
  console.log('Result:', text)
  ```

+ ## Lib Variants
+
+ - [x] `default`: General usage, not support GPU except macOS (Metal)
+ - [x] `vulkan`: Support GPU Vulkan (Windows/Linux), but some scenario might unstable
+
  ## License

  MIT
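The new `lib_variant` option selects which prebuilt binary is loaded at runtime (`bin/<platform>-<variant>/<arch>/llama-node.node`). A minimal sketch of opting into the Vulkan build; the model-path key and `./model.gguf` are placeholders, since that part of the options object is not shown in the README hunk above:

```ts
import { loadModel } from '@fugood/llama.node'

async function main() {
  const context = await loadModel({
    model: './model.gguf',  // placeholder path; key name assumed, not shown in this diff
    n_ctx: 2048,
    n_gpu_layers: 32,       // > 0: offload layers to the GPU
    lib_variant: 'vulkan',  // loads bin/<platform>-vulkan/<arch>/llama-node.node
  })
  console.log(context.getSystemInfo())
}

main()
```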
package/lib/binding.js CHANGED
@@ -50,7 +50,7 @@ const setupEnv = (variant) => {
  };
  const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
  try {
- if (variant) {
+ if (variant && variant !== 'default') {
  setupEnv(variant);
  return yield Promise.resolve(`${`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`}`).then(s => __importStar(require(s)));
  }
package/lib/binding.ts CHANGED
@@ -23,6 +23,7 @@ export type LlamaCompletionOptions = {
  max_tokens?: number
  seed?: number
  stop?: string[]
+ grammar?: string
  }

  export type LlamaCompletionResult = {
@@ -36,11 +37,22 @@ export type LlamaCompletionToken = {
  token: string
  }

+ export type TokenizeResult = {
+ tokens: Int32Array
+ }
+
+ export type EmbeddingResult = {
+ embedding: Float32Array
+ }
+
  export interface LlamaContext {
  new (options: LlamaModelOptions): LlamaContext
  getSystemInfo(): string
  completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
  stopCompletion(): void
+ tokenize(text: string): Promise<TokenizeResult>
+ detokenize(tokens: number[]): Promise<string>
+ embedding(text: string): Promise<EmbeddingResult>
  saveSession(path: string): Promise<void>
  loadSession(path: string): Promise<void>
  release(): Promise<void>
@@ -50,6 +62,8 @@ export interface Module {
  LlamaContext: LlamaContext
  }

+ export type LibVariant = 'default' | 'opencl'
+
  const setupEnv = (variant?: string) => {
  const postfix = variant ? `-${variant}` : ''
  const binPath = path.resolve(__dirname, `../bin/${process.platform}${postfix}/${process.arch}/`)
@@ -63,9 +77,9 @@ const setupEnv = (variant?: string) => {
  }
  }

- export const loadModule = async (variant?: string): Promise<Module> => {
+ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  try {
- if (variant) {
+ if (variant && variant !== 'default') {
  setupEnv(variant)
  return await import(`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`) as Module
  }
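The interface changes above add `tokenize`, `detokenize`, and `embedding` to `LlamaContext`, each resolving with typed-array results. A minimal round-trip sketch, assuming `context` was created via `loadModel` as in the README:

```ts
// tokens is an Int32Array of token ids; detokenize expects a plain number[].
const { tokens } = await context.tokenize('Hello world')
const text = await context.detokenize(Array.from(tokens))
console.log(tokens.length, 'tokens ->', text)
```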
package/lib/index.ts CHANGED
@@ -1,10 +1,10 @@
  import { loadModule, LlamaModelOptions } from './binding'
- import type { Module, LlamaContext } from './binding'
+ import type { Module, LlamaContext, LibVariant } from './binding'

  export * from './binding'

  export interface LlamaModelOptionsExtended extends LlamaModelOptions {
- lib_variant?: string
+ lib_variant?: LibVariant
  }

  const mods: { [key: string]: Module } = {}
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.0.1-alpha.4",
+ "version": "0.2.0",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
@@ -39,8 +39,6 @@
  },
  "files": [
  "bin/**/*",
- "scripts/*.js",
- "scripts/*.ts",
  "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
  "lib/*.js",
  "lib/*.ts",
@@ -62,5 +60,19 @@
  },
  "dependencies": {
  "node-addon-api": "^8.0.0"
+ },
+ "jest": {
+ "testEnvironment": "node",
+ "moduleFileExtensions": [
+ "ts",
+ "tsx",
+ "js",
+ "jsx",
+ "json",
+ "node"
+ ],
+ "testMatch": [
+ "**/*.test.ts"
+ ]
  }
  }
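The new `jest` block collects any `*.test.ts` file in a Node environment (a TypeScript transform such as ts-jest is assumed but not visible in this diff). A sketch of a test file the `testMatch` pattern would pick up; the file name and assertion are illustrative:

```ts
// lib/binding.test.ts — matched by "**/*.test.ts"
import { loadModule } from './binding'

test('default variant exposes LlamaContext', async () => {
  const mod = await loadModule('default')
  expect(mod.LlamaContext).toBeDefined()
})
```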
package/src/DetokenizeWorker.cpp ADDED
@@ -0,0 +1,22 @@
+ #include "DetokenizeWorker.h"
+ #include "LlamaContext.h"
+
+ DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
+ LlamaSessionPtr &sess,
+ std::vector<llama_token> &tokens)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
+ _tokens(std::move(tokens)) {}
+
+ void DetokenizeWorker::Execute() {
+ const auto text = ::llama_detokenize_bpe(_sess->context(), _tokens);
+ _text = std::move(text);
+ }
+
+ void DetokenizeWorker::OnOK() {
+ Napi::Promise::Deferred::Resolve(
+ Napi::String::New(Napi::AsyncWorker::Env(), _text));
+ }
+
+ void DetokenizeWorker::OnError(const Napi::Error &err) {
+ Napi::Promise::Deferred::Reject(err.Value());
+ }
package/src/DetokenizeWorker.h ADDED
@@ -0,0 +1,19 @@
+ #include "common.hpp"
+ #include <vector>
+
+ class DetokenizeWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ DetokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+ std::vector<llama_token> &tokens);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ LlamaSessionPtr _sess;
+ std::vector<llama_token> _tokens;
+ std::string _text;
+ };
package/src/EmbeddingWorker.cpp ADDED
@@ -0,0 +1,46 @@
+ #include "EmbeddingWorker.h"
+ #include "LlamaContext.h"
+
+ EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
+ LlamaSessionPtr &sess, std::string text)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+ void EmbeddingWorker::Execute() {
+ llama_kv_cache_clear(_sess->context());
+ auto tokens = ::llama_tokenize(_sess->context(), _text, true);
+ // add SEP if not present
+ if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
+ tokens.push_back(llama_token_sep(_sess->model()));
+ }
+ const int n_embd = llama_n_embd(_sess->model());
+ do {
+ int ret =
+ llama_decode(_sess->context(),
+ llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
+ if (ret < 0) {
+ SetError("Failed to inference, code: " + std::to_string(ret));
+ break;
+ }
+ const float *embd = llama_get_embeddings_seq(_sess->context(), 0);
+ if (embd == nullptr) {
+ SetError("Failed to get embeddings");
+ break;
+ }
+ _result.embedding.resize(n_embd);
+ memcpy(_result.embedding.data(), embd, n_embd * sizeof(float));
+ } while (false);
+ }
+
+ void EmbeddingWorker::OnOK() {
+ auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+ auto embedding = Napi::Float32Array::New(Napi::AsyncWorker::Env(),
+ _result.embedding.size());
+ memcpy(embedding.Data(), _result.embedding.data(),
+ _result.embedding.size() * sizeof(float));
+ result.Set("embedding", embedding);
+ Napi::Promise::Deferred::Resolve(result);
+ }
+
+ void EmbeddingWorker::OnError(const Napi::Error &err) {
+ Napi::Promise::Deferred::Reject(err.Value());
+ }
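`EmbeddingWorker` clears the KV cache, decodes the tokenized text, and resolves with `{ embedding: Float32Array }` of length `n_embd`. A sketch of consuming that result from JS, e.g. comparing two texts by cosine similarity (the helper below is illustrative, not part of the package):

```ts
function cosineSimilarity(a: Float32Array, b: Float32Array): number {
  let dot = 0, normA = 0, normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB))
}

// Assumes `context` was created with `embedding: true`.
const { embedding: e1 } = await context.embedding('The sky is blue')
const { embedding: e2 } = await context.embedding('The ocean is blue')
console.log('similarity:', cosineSimilarity(e1, e2))
```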
package/src/EmbeddingWorker.h ADDED
@@ -0,0 +1,23 @@
+ #include "common.hpp"
+ #include <vector>
+
+ struct EmbeddingResult {
+ std::vector<float> embedding;
+ };
+
+ class EmbeddingWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ EmbeddingWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+ std::string text);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ LlamaSessionPtr _sess;
+ std::string _text;
+ EmbeddingResult _result;
+ };
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -58,7 +58,7 @@ void LlamaCompletionWorker::Execute() {
  const auto n_keep = _params.n_keep;
  size_t n_cur = 0;
  size_t n_input = 0;
- const auto model = llama_get_model(_sess->context());
+ const auto model = _sess->model();
  const bool add_bos = llama_should_add_bos_token(model);
  auto ctx = _sess->context();

@@ -110,6 +110,7 @@ void LlamaCompletionWorker::Execute() {
  // sample the next token
  const llama_token new_token_id =
  llama_sampling_sample(sampling.get(), ctx, nullptr);
+ llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
  auto token = llama_token_to_piece(ctx, new_token_id);
@@ -143,6 +144,9 @@ void LlamaCompletionWorker::Execute() {
  }
  const auto t_main_end = ggml_time_us();
  _sess->get_mutex().unlock();
+ if (_onComplete) {
+ _onComplete();
+ }
  }

  void LlamaCompletionWorker::OnOK() {
package/src/LlamaCompletionWorker.h CHANGED
@@ -1,4 +1,5 @@
  #include "common.hpp"
+ #include <functional>

  struct CompletionResult {
  std::string text = "";
@@ -18,6 +19,8 @@ public:

  inline void Stop() { _stop = true; }

+ inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+
  protected:
  void Execute();
  void OnOK();
@@ -30,5 +33,6 @@ private:
  Napi::ThreadSafeFunction _tsfn;
  bool _has_callback = false;
  bool _stop = false;
+ std::function<void()> _onComplete;
  CompletionResult _result;
  };
package/src/LlamaContext.cpp CHANGED
@@ -1,8 +1,11 @@
  #include "LlamaContext.h"
+ #include "DetokenizeWorker.h"
  #include "DisposeWorker.h"
+ #include "EmbeddingWorker.h"
  #include "LlamaCompletionWorker.h"
  #include "LoadSessionWorker.h"
  #include "SaveSessionWorker.h"
+ #include "TokenizeWorker.h"

  void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  Napi::Function func = DefineClass(
@@ -16,6 +19,13 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
  InstanceMethod<&LlamaContext::StopCompletion>(
  "stopCompletion",
  static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::Tokenize>(
+ "tokenize", static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::Detokenize>(
+ "detokenize",
+ static_cast<napi_property_attributes>(napi_enumerable)),
+ InstanceMethod<&LlamaContext::Embedding>(
+ "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
  InstanceMethod<&LlamaContext::SaveSession>(
  "saveSession",
  static_cast<napi_property_attributes>(napi_enumerable)),
@@ -70,7 +80,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  .ThrowAsJavaScriptException();
  }

- _sess = std::make_shared<LlamaSession>(ctx, params);
+ _sess = std::make_shared<LlamaSession>(model, ctx, params);
  _info = get_system_info(params);
  }

@@ -93,6 +103,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ if (_wip != nullptr) {
+ Napi::TypeError::New(env, "Another completion is in progress")
+ .ThrowAsJavaScriptException();
+ }
  auto options = info[0].As<Napi::Object>();

  gpt_params params = _sess->params();
@@ -143,6 +157,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words);
  worker->Queue();
  _wip = worker;
+ worker->onComplete([this]() { _wip = nullptr; });
  return worker->Promise();
  }

@@ -153,6 +168,58 @@ void LlamaContext::StopCompletion(const Napi::CallbackInfo &info) {
  }
  }

+ // tokenize(text: string): Promise<TokenizeResult>
+ Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (info.Length() < 1 || !info[0].IsString()) {
+ Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+ }
+ if (_sess == nullptr) {
+ Napi::TypeError::New(env, "Context is disposed")
+ .ThrowAsJavaScriptException();
+ }
+ auto text = info[0].ToString().Utf8Value();
+ auto *worker = new TokenizeWorker(info, _sess, text);
+ worker->Queue();
+ return worker->Promise();
+ }
+
+ // detokenize(tokens: number[]): Promise<string>
+ Napi::Value LlamaContext::Detokenize(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (info.Length() < 1 || !info[0].IsArray()) {
+ Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
+ }
+ if (_sess == nullptr) {
+ Napi::TypeError::New(env, "Context is disposed")
+ .ThrowAsJavaScriptException();
+ }
+ auto tokens = info[0].As<Napi::Array>();
+ std::vector<int32_t> token_ids;
+ for (size_t i = 0; i < tokens.Length(); i++) {
+ token_ids.push_back(tokens.Get(i).ToNumber().Int32Value());
+ }
+ auto *worker = new DetokenizeWorker(info, _sess, token_ids);
+ worker->Queue();
+ return worker->Promise();
+ }
+
+ // embedding(text: string): Promise<EmbeddingResult>
+ Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
+ Napi::Env env = info.Env();
+ if (info.Length() < 1 || !info[0].IsString()) {
+ Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+ }
+ if (_sess == nullptr) {
+ Napi::TypeError::New(env, "Context is disposed")
+ .ThrowAsJavaScriptException();
+ }
+ auto text = info[0].ToString().Utf8Value();
+ auto *worker = new EmbeddingWorker(info, _sess, text);
+ worker->Queue();
+ return worker->Promise();
+ }
+
  // saveSession(path: string): Promise<void> throws error
  Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
  Napi::Env env = info.Env();
@@ -163,6 +230,12 @@ Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ #ifdef GGML_USE_VULKAN
+ if (_sess->params().n_gpu_layers > 0) {
+ Napi::TypeError::New(env, "Vulkan cannot save session")
+ .ThrowAsJavaScriptException();
+ }
+ #endif
  auto *worker = new SaveSessionWorker(info, _sess);
  worker->Queue();
  return worker->Promise();
@@ -178,6 +251,12 @@ Napi::Value LlamaContext::LoadSession(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ #ifdef GGML_USE_VULKAN
+ if (_sess->params().n_gpu_layers > 0) {
+ Napi::TypeError::New(env, "Vulkan cannot load session")
+ .ThrowAsJavaScriptException();
+ }
+ #endif
  auto *worker = new LoadSessionWorker(info, _sess);
  worker->Queue();
  return worker->Promise();
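Two behavioral changes above are visible from JS: `completion` now throws while another completion is still in flight (`_wip` is only cleared via the new `onComplete` hook), and the Vulkan build rejects `saveSession`/`loadSession` when `n_gpu_layers > 0`. A sketch of handling both; the completion options key and the session path are placeholders not shown in this diff:

```ts
// Only one completion may run per context at a time.
const first = context.completion({ prompt: 'Hello' })
try {
  await context.completion({ prompt: 'World' })
} catch (err) {
  console.warn('busy:', err) // "Another completion is in progress"
}
await first

// On the vulkan variant with GPU layers offloaded, session persistence is rejected.
try {
  await context.saveSession('./session.bin') // placeholder path
} catch (err) {
  console.warn('session not saved:', err)
}
```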
package/src/LlamaContext.h CHANGED
@@ -11,6 +11,9 @@ private:
  Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
  Napi::Value Completion(const Napi::CallbackInfo &info);
  void StopCompletion(const Napi::CallbackInfo &info);
+ Napi::Value Tokenize(const Napi::CallbackInfo &info);
+ Napi::Value Detokenize(const Napi::CallbackInfo &info);
+ Napi::Value Embedding(const Napi::CallbackInfo &info);
  Napi::Value SaveSession(const Napi::CallbackInfo &info);
  Napi::Value LoadSession(const Napi::CallbackInfo &info);
  Napi::Value Release(const Napi::CallbackInfo &info);
package/src/TokenizeWorker.cpp ADDED
@@ -0,0 +1,26 @@
+ #include "TokenizeWorker.h"
+ #include "LlamaContext.h"
+
+ TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
+ LlamaSessionPtr &sess, std::string text)
+ : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+ void TokenizeWorker::Execute() {
+ const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
+ _result = {.tokens = std::move(tokens)};
+ }
+
+ void TokenizeWorker::OnOK() {
+ Napi::HandleScope scope(Napi::AsyncWorker::Env());
+ auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+ auto tokens =
+ Napi::Int32Array::New(Napi::AsyncWorker::Env(), _result.tokens.size());
+ memcpy(tokens.Data(), _result.tokens.data(),
+ _result.tokens.size() * sizeof(llama_token));
+ result.Set("tokens", tokens);
+ Napi::Promise::Deferred::Resolve(result);
+ }
+
+ void TokenizeWorker::OnError(const Napi::Error &err) {
+ Napi::Promise::Deferred::Reject(err.Value());
+ }
package/src/TokenizeWorker.h ADDED
@@ -0,0 +1,23 @@
+ #include "common.hpp"
+ #include <vector>
+
+ struct TokenizeResult {
+ std::vector<llama_token> tokens;
+ };
+
+ class TokenizeWorker : public Napi::AsyncWorker,
+ public Napi::Promise::Deferred {
+ public:
+ TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+ std::string text);
+
+ protected:
+ void Execute();
+ void OnOK();
+ void OnError(const Napi::Error &err);
+
+ private:
+ LlamaSessionPtr _sess;
+ std::string _text;
+ TokenizeResult _result;
+ };
package/src/common.hpp CHANGED
@@ -46,32 +46,37 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,

  class LlamaSession {
  public:
- LlamaSession(llama_context *ctx, gpt_params params)
- : ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+ LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+ : model_(LlamaCppModel(model, llama_free_model)),
+ ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
  tokens_.reserve(params.n_ctx);
  }

  ~LlamaSession() { dispose(); }

- llama_context *context() { return ctx_.get(); }
+ inline llama_context *context() { return ctx_.get(); }

- std::vector<llama_token>* tokens_ptr() { return &tokens_; }
+ inline llama_model *model() { return model_.get(); }

- void set_tokens(std::vector<llama_token> tokens) {
+ inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }
+
+ inline void set_tokens(std::vector<llama_token> tokens) {
  tokens_ = std::move(tokens);
  }

- const gpt_params &params() const { return params_; }
+ inline const gpt_params &params() const { return params_; }

- std::mutex &get_mutex() { return mutex; }
+ inline std::mutex &get_mutex() { return mutex; }

  void dispose() {
  std::lock_guard<std::mutex> lock(mutex);
  tokens_.clear();
  ctx_.reset();
+ model_.reset();
  }

  private:
+ LlamaCppModel model_;
  LlamaCppContext ctx_;
  const gpt_params params_;
  std::vector<llama_token> tokens_{};