@fugood/llama.node 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/CMakeLists.txt +15 -0
  2. package/README.md +3 -2
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  10. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  11. package/bin/win32/arm64/llama-node.node +0 -0
  12. package/bin/win32/arm64/node.lib +0 -0
  13. package/bin/win32/x64/llama-node.node +0 -0
  14. package/bin/win32/x64/node.lib +0 -0
  15. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/arm64/node.lib +0 -0
  17. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  18. package/bin/win32-vulkan/x64/node.lib +0 -0
  19. package/lib/binding.ts +12 -1
  20. package/package.json +2 -1
  21. package/patches/llama.patch +22 -0
  22. package/src/DetokenizeWorker.cpp +22 -0
  23. package/src/DetokenizeWorker.h +19 -0
  24. package/src/EmbeddingWorker.cpp +46 -0
  25. package/src/EmbeddingWorker.h +23 -0
  26. package/src/LlamaContext.cpp +62 -0
  27. package/src/LlamaContext.h +3 -0
  28. package/src/TokenizeWorker.cpp +26 -0
  29. package/src/TokenizeWorker.h +23 -0
  30. package/src/common.hpp +3 -2
  31. package/src/llama.cpp/CMakeLists.txt +14 -12
  32. package/src/llama.cpp/common/common.cpp +19 -5
  33. package/src/llama.cpp/common/common.h +2 -0
  34. package/src/llama.cpp/common/grammar-parser.cpp +9 -0
  35. package/src/llama.cpp/common/sampling.cpp +3 -3
  36. package/src/llama.cpp/common/sampling.h +1 -1
  37. package/src/llama.cpp/examples/CMakeLists.txt +3 -0
  38. package/src/llama.cpp/examples/embedding/embedding.cpp +10 -2
  39. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +56 -7
  40. package/src/llama.cpp/examples/llama.android/{app/src/main/cpp → llama}/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +49 -0
  42. package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
  43. package/src/llama.cpp/examples/llava/llava-cli.cpp +26 -6
  44. package/src/llama.cpp/examples/main/main.cpp +5 -1
  45. package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
  46. package/src/llama.cpp/examples/rpc/rpc-server.cpp +70 -0
  47. package/src/llama.cpp/examples/server/server.cpp +12 -16
  48. package/src/llama.cpp/examples/server/utils.hpp +1 -1
  49. package/src/llama.cpp/ggml-backend.c +2 -2
  50. package/src/llama.cpp/ggml-kompute.cpp +9 -3
  51. package/src/llama.cpp/ggml-quants.c +6 -0
  52. package/src/llama.cpp/ggml-rpc.cpp +1023 -0
  53. package/src/llama.cpp/ggml-rpc.h +24 -0
  54. package/src/llama.cpp/ggml-sycl.cpp +20 -143
  55. package/src/llama.cpp/ggml-vulkan.cpp +4 -2
  56. package/src/llama.cpp/ggml.c +116 -271
  57. package/src/llama.cpp/ggml.h +12 -15
  58. package/src/llama.cpp/llama.cpp +451 -265
  59. package/src/llama.cpp/llama.h +3 -0
  60. package/src/llama.cpp/requirements.txt +0 -1
  61. package/src/llama.cpp/tests/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/tests/test-backend-ops.cpp +16 -19
  63. package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
  64. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
  65. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
package/CMakeLists.txt CHANGED
@@ -64,6 +64,15 @@ if (VULKAN_SDK)
   find_package(Vulkan REQUIRED)
 endif()
 
+find_program(PATCH patch REQUIRED)
+
+add_custom_target(
+  patch ALL
+  COMMAND ${PATCH} -p1 -N < ${CMAKE_SOURCE_DIR}/patches/llama.patch || true
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/llama.cpp
+  COMMENT "Applying patches"
+)
+
 set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
 add_subdirectory("src/llama.cpp")
 
@@ -77,6 +86,12 @@ file(
   "src/LlamaCompletionWorker.h"
   "src/LlamaContext.cpp"
   "src/LlamaContext.h"
+  "src/TokenizeWorker.cpp"
+  "src/TokenizeWorker.h"
+  "src/DetokenizeWorker.cpp"
+  "src/DetokenizeWorker.h"
+  "src/EmbeddingWorker.cpp"
+  "src/EmbeddingWorker.h"
   "src/LoadSessionWorker.cpp"
   "src/LoadSessionWorker.h"
   "src/SaveSessionWorker.cpp"
package/README.md CHANGED
@@ -30,7 +30,7 @@ const context = await loadModel({
 })
 
 // Do completion
-const { text, timings } = await context.completion(
+const { text } = await context.completion(
   {
     prompt: 'This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.\n\nUser: Hello!\nLlama:',
     n_predict: 100,
@@ -47,7 +47,8 @@ console.log('Result:', text)
 
 ## Lib Variants
 
-- [x] `default`: General usage, Supported GPU: Metal (macOS) and Vulkan (Linux / Windows)
+- [x] `default`: General usage, not support GPU except macOS (Metal)
+- [x] `vulkan`: Support GPU Vulkan (Windows/Linux), but some scenario might unstable
 
 ## License
 
Binary files changed (prebuilt binaries under `package/bin/**`, listed above) — no textual diff available.
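Note on the variant split: the Vulkan build now ships as the separate `*-vulkan` binaries listed above, and the `LibVariant` type in `lib/binding.ts` changes accordingly (diff below). A minimal sketch of opting into it, assuming the loader exposes the variant through a `lib_variant` option on `loadModel` (that option name is an assumption; this diff only shows the `LibVariant` union changing from `'default' | 'opencl'` to `'default' | 'vulkan'`):

```js
const { loadModel } = require('@fugood/llama.node')

// Assumed option name: lib_variant (not shown in this diff).
const context = await loadModel({
  model: 'path/to/model.gguf',
  n_ctx: 2048,
  n_gpu_layers: 99,       // offload layers to the GPU where supported
  lib_variant: 'vulkan',  // loads the linux-vulkan / win32-vulkan binary
})
```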
package/lib/binding.ts CHANGED
@@ -37,11 +37,22 @@ export type LlamaCompletionToken = {
   token: string
 }
 
+export type TokenizeResult = {
+  tokens: Int32Array
+}
+
+export type EmbeddingResult = {
+  embedding: Float32Array
+}
+
 export interface LlamaContext {
   new (options: LlamaModelOptions): LlamaContext
   getSystemInfo(): string
   completion(options: LlamaCompletionOptions, callback?: (token: LlamaCompletionToken) => void): Promise<LlamaCompletionResult>
   stopCompletion(): void
+  tokenize(text: string): Promise<TokenizeResult>
+  detokenize(tokens: number[]): Promise<string>
+  embedding(text: string): Promise<EmbeddingResult>
   saveSession(path: string): Promise<void>
   loadSession(path: string): Promise<void>
   release(): Promise<void>
@@ -51,7 +62,7 @@ export interface Module {
   LlamaContext: LlamaContext
 }
 
-export type LibVariant = 'default' | 'opencl'
+export type LibVariant = 'default' | 'vulkan'
 
 const setupEnv = (variant?: string) => {
   const postfix = variant ? `-${variant}` : ''
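The three new `LlamaContext` methods return either a plain string or an object wrapping a typed array (`Int32Array` for tokens, `Float32Array` for the embedding). A rough usage sketch against these signatures, assuming `context` was created via `loadModel` as in the README (and, for `embedding`, that the model was presumably loaded with embeddings enabled, which this diff does not show):

```js
// tokenize: string -> { tokens: Int32Array }
const { tokens } = await context.tokenize('Hello, llama!')

// detokenize: number[] -> string (round-trips the token ids)
const text = await context.detokenize(Array.from(tokens))

// embedding: string -> { embedding: Float32Array }
const { embedding } = await context.embedding('Hello, llama!')

console.log(tokens.length, 'tokens ->', JSON.stringify(text), '| dims:', embedding.length)
```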
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "0.1.0",
+  "version": "0.2.1",
   "description": "Llama.cpp for Node.js",
   "main": "lib/index.js",
   "scripts": {
@@ -38,6 +38,7 @@
     ]
   },
   "files": [
+    "patches/*.patch",
    "bin/**/*",
    "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
    "lib/*.js",
package/patches/llama.patch ADDED
@@ -0,0 +1,22 @@
+diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
+index b9449be0..cfa0f774 100644
+--- a/ggml-vulkan.cpp
++++ b/ggml-vulkan.cpp
+@@ -525,9 +525,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
+         vk::PipelineCreateFlags(),
+         pipeline_shader_create_info,
+         pipeline->layout);
+-    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+ 
+-    ctx->device->pipelines.push_back(pipeline);
++    try {
++        pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        ctx->device->pipelines.push_back(pipeline);
++    } catch (vk::UnknownError const&) {
++        std::cerr << "ggml_vk_create_pipeline: Failed to create pipeline " << name << std::endl;
++        ggml_vk_destroy_pipeline(ctx->device->device, pipeline);
++        pipeline.reset();
++    }
+ }
+ 
+ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
package/src/DetokenizeWorker.cpp ADDED
@@ -0,0 +1,22 @@
+#include "DetokenizeWorker.h"
+#include "LlamaContext.h"
+
+DetokenizeWorker::DetokenizeWorker(const Napi::CallbackInfo &info,
+                                   LlamaSessionPtr &sess,
+                                   std::vector<llama_token> &tokens)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess),
+      _tokens(std::move(tokens)) {}
+
+void DetokenizeWorker::Execute() {
+  const auto text = ::llama_detokenize_bpe(_sess->context(), _tokens);
+  _text = std::move(text);
+}
+
+void DetokenizeWorker::OnOK() {
+  Napi::Promise::Deferred::Resolve(
+      Napi::String::New(Napi::AsyncWorker::Env(), _text));
+}
+
+void DetokenizeWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/DetokenizeWorker.h ADDED
@@ -0,0 +1,19 @@
+#include "common.hpp"
+#include <vector>
+
+class DetokenizeWorker : public Napi::AsyncWorker,
+                         public Napi::Promise::Deferred {
+public:
+  DetokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                   std::vector<llama_token> &tokens);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::vector<llama_token> _tokens;
+  std::string _text;
+};
package/src/EmbeddingWorker.cpp ADDED
@@ -0,0 +1,46 @@
+#include "EmbeddingWorker.h"
+#include "LlamaContext.h"
+
+EmbeddingWorker::EmbeddingWorker(const Napi::CallbackInfo &info,
+                                 LlamaSessionPtr &sess, std::string text)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+void EmbeddingWorker::Execute() {
+  llama_kv_cache_clear(_sess->context());
+  auto tokens = ::llama_tokenize(_sess->context(), _text, true);
+  // add SEP if not present
+  if (tokens.empty() || tokens.back() != llama_token_sep(_sess->model())) {
+    tokens.push_back(llama_token_sep(_sess->model()));
+  }
+  const int n_embd = llama_n_embd(_sess->model());
+  do {
+    int ret =
+        llama_decode(_sess->context(),
+                     llama_batch_get_one(tokens.data(), tokens.size(), 0, 0));
+    if (ret < 0) {
+      SetError("Failed to inference, code: " + std::to_string(ret));
+      break;
+    }
+    const float *embd = llama_get_embeddings_seq(_sess->context(), 0);
+    if (embd == nullptr) {
+      SetError("Failed to get embeddings");
+      break;
+    }
+    _result.embedding.resize(n_embd);
+    memcpy(_result.embedding.data(), embd, n_embd * sizeof(float));
+  } while (false);
+}
+
+void EmbeddingWorker::OnOK() {
+  auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+  auto embedding = Napi::Float32Array::New(Napi::AsyncWorker::Env(),
+                                           _result.embedding.size());
+  memcpy(embedding.Data(), _result.embedding.data(),
+         _result.embedding.size() * sizeof(float));
+  result.Set("embedding", embedding);
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void EmbeddingWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
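`EmbeddingWorker::Execute` clears the KV cache, tokenizes the input (appending a SEP token if one is missing), runs a single `llama_decode`, and copies `n_embd` floats of the sequence embedding into the result, which reaches JavaScript as a `Float32Array`. An illustrative sketch (not part of the package) of comparing two such embeddings, assuming `context` as in the README example:

```js
// Cosine similarity between two embedding() results (Float32Array inputs).
function cosineSimilarity(a, b) {
  let dot = 0, normA = 0, normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB))
}

const { embedding: e1 } = await context.embedding('The cat sat on the mat.')
const { embedding: e2 } = await context.embedding('A cat is sitting on a rug.')
console.log('similarity:', cosineSimilarity(e1, e2))
```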
package/src/EmbeddingWorker.h ADDED
@@ -0,0 +1,23 @@
+#include "common.hpp"
+#include <vector>
+
+struct EmbeddingResult {
+  std::vector<float> embedding;
+};
+
+class EmbeddingWorker : public Napi::AsyncWorker,
+                        public Napi::Promise::Deferred {
+public:
+  EmbeddingWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                  std::string text);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _text;
+  EmbeddingResult _result;
+};
package/src/LlamaContext.cpp CHANGED
@@ -1,8 +1,11 @@
 #include "LlamaContext.h"
+#include "DetokenizeWorker.h"
 #include "DisposeWorker.h"
+#include "EmbeddingWorker.h"
 #include "LlamaCompletionWorker.h"
 #include "LoadSessionWorker.h"
 #include "SaveSessionWorker.h"
+#include "TokenizeWorker.h"
 
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
@@ -16,6 +19,13 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
        InstanceMethod<&LlamaContext::StopCompletion>(
            "stopCompletion",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::Tokenize>(
+           "tokenize", static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::Detokenize>(
+           "detokenize",
+           static_cast<napi_property_attributes>(napi_enumerable)),
+       InstanceMethod<&LlamaContext::Embedding>(
+           "embedding", static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::SaveSession>(
            "saveSession",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -158,6 +168,58 @@ void LlamaContext::StopCompletion(const Napi::CallbackInfo &info) {
   }
 }
 
+// tokenize(text: string): Promise<TokenizeResult>
+Napi::Value LlamaContext::Tokenize(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsString()) {
+    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto text = info[0].ToString().Utf8Value();
+  auto *worker = new TokenizeWorker(info, _sess, text);
+  worker->Queue();
+  return worker->Promise();
+}
+
+// detokenize(tokens: number[]): Promise<string>
+Napi::Value LlamaContext::Detokenize(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsArray()) {
+    Napi::TypeError::New(env, "Array expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto tokens = info[0].As<Napi::Array>();
+  std::vector<int32_t> token_ids;
+  for (size_t i = 0; i < tokens.Length(); i++) {
+    token_ids.push_back(tokens.Get(i).ToNumber().Int32Value());
+  }
+  auto *worker = new DetokenizeWorker(info, _sess, token_ids);
+  worker->Queue();
+  return worker->Promise();
+}
+
+// embedding(text: string): Promise<EmbeddingResult>
+Napi::Value LlamaContext::Embedding(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  if (info.Length() < 1 || !info[0].IsString()) {
+    Napi::TypeError::New(env, "String expected").ThrowAsJavaScriptException();
+  }
+  if (_sess == nullptr) {
+    Napi::TypeError::New(env, "Context is disposed")
+        .ThrowAsJavaScriptException();
+  }
+  auto text = info[0].ToString().Utf8Value();
+  auto *worker = new EmbeddingWorker(info, _sess, text);
+  worker->Queue();
+  return worker->Promise();
+}
+
 // saveSession(path: string): Promise<void> throws error
 Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
   Napi::Env env = info.Env();
package/src/LlamaContext.h CHANGED
@@ -11,6 +11,9 @@ private:
   Napi::Value GetSystemInfo(const Napi::CallbackInfo &info);
   Napi::Value Completion(const Napi::CallbackInfo &info);
   void StopCompletion(const Napi::CallbackInfo &info);
+  Napi::Value Tokenize(const Napi::CallbackInfo &info);
+  Napi::Value Detokenize(const Napi::CallbackInfo &info);
+  Napi::Value Embedding(const Napi::CallbackInfo &info);
   Napi::Value SaveSession(const Napi::CallbackInfo &info);
   Napi::Value LoadSession(const Napi::CallbackInfo &info);
   Napi::Value Release(const Napi::CallbackInfo &info);
package/src/TokenizeWorker.cpp ADDED
@@ -0,0 +1,26 @@
+#include "TokenizeWorker.h"
+#include "LlamaContext.h"
+
+TokenizeWorker::TokenizeWorker(const Napi::CallbackInfo &info,
+                               LlamaSessionPtr &sess, std::string text)
+    : AsyncWorker(info.Env()), Deferred(info.Env()), _sess(sess), _text(text) {}
+
+void TokenizeWorker::Execute() {
+  const auto tokens = ::llama_tokenize(_sess->context(), _text, false);
+  _result.tokens = std::move(tokens);
+}
+
+void TokenizeWorker::OnOK() {
+  Napi::HandleScope scope(Napi::AsyncWorker::Env());
+  auto result = Napi::Object::New(Napi::AsyncWorker::Env());
+  auto tokens =
+      Napi::Int32Array::New(Napi::AsyncWorker::Env(), _result.tokens.size());
+  memcpy(tokens.Data(), _result.tokens.data(),
+         _result.tokens.size() * sizeof(llama_token));
+  result.Set("tokens", tokens);
+  Napi::Promise::Deferred::Resolve(result);
+}
+
+void TokenizeWorker::OnError(const Napi::Error &err) {
+  Napi::Promise::Deferred::Reject(err.Value());
+}
package/src/TokenizeWorker.h ADDED
@@ -0,0 +1,23 @@
+#include "common.hpp"
+#include <vector>
+
+struct TokenizeResult {
+  std::vector<llama_token> tokens;
+};
+
+class TokenizeWorker : public Napi::AsyncWorker,
+                       public Napi::Promise::Deferred {
+public:
+  TokenizeWorker(const Napi::CallbackInfo &info, LlamaSessionPtr &sess,
+                 std::string text);
+
+protected:
+  void Execute();
+  void OnOK();
+  void OnError(const Napi::Error &err);
+
+private:
+  LlamaSessionPtr _sess;
+  std::string _text;
+  TokenizeResult _result;
+};
package/src/common.hpp CHANGED
@@ -47,7 +47,8 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 class LlamaSession {
 public:
   LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
-      : model_(LlamaCppModel(model, llama_free_model)), ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+      : model_(LlamaCppModel(model, llama_free_model)),
+        ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
     tokens_.reserve(params.n_ctx);
   }
@@ -57,7 +58,7 @@ public:
 
   inline llama_model *model() { return model_.get(); }
 
-  inline std::vector<llama_token>* tokens_ptr() { return &tokens_; }
+  inline std::vector<llama_token> *tokens_ptr() { return &tokens_; }
 
   inline void set_tokens(std::vector<llama_token> tokens) {
     tokens_ = std::move(tokens);
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -123,6 +123,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
 set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
 option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_MPI "llama: use MPI" OFF)
+option(LLAMA_RPC "llama: use RPC" OFF)
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -296,7 +297,7 @@ if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
-    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
        set(BLA_SIZEOF_INTEGER 8)
     endif()
 
@@ -494,6 +495,17 @@ if (LLAMA_MPI)
     endif()
 endif()
 
+if (LLAMA_RPC)
+    add_compile_definitions(GGML_USE_RPC)
+
+    if (WIN32)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
+    endif()
+
+    set(GGML_HEADERS_RPC ggml-rpc.h)
+    set(GGML_SOURCES_RPC ggml-rpc.cpp)
+endif()
+
 if (LLAMA_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -1176,6 +1188,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
             ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
             ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
             ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
             ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
@@ -1281,17 +1294,6 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
     install(
         FILES ggml-metal.metal
package/src/llama.cpp/common/common.cpp CHANGED
@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1056,6 +1060,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 #endif // GGML_USE_CUDA_SYCL_VULKAN
         return true;
     }
+    if (arg == "--rpc") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.rpc_servers = argv[i];
+        return true;
+    }
     if (arg == "--no-mmap") {
         params.use_mmap = false;
         return true;
@@ -1367,14 +1379,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }
 
     if (params.prompt_cache_all &&
@@ -1422,6 +1432,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@@ -1554,6 +1565,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
     }
+    printf("  --rpc SERVERS         comma separated list of RPC servers\n");
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");
@@ -1827,6 +1839,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
+    mparams.rpc_servers = params.rpc_servers.c_str();
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
@@ -2652,6 +2665,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
package/src/llama.cpp/common/common.h CHANGED
@@ -82,6 +82,7 @@ struct gpt_params {
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -140,6 +141,7 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
package/src/llama.cpp/common/grammar-parser.cpp CHANGED
@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos            = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos             = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {
 
                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos               = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
package/src/llama.cpp/common/sampling.cpp CHANGED
@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
-    result->n_considered = 0;
+    result->n_valid = 0;
 
     llama_sampling_set_rng_seed(result, params.seed);
 
@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->n_considered = 0;
+    ctx->n_valid = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
-    ctx_sampling->n_considered = cur_p.size;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;
 
     return id;
 }
package/src/llama.cpp/common/sampling.h CHANGED
@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
    std::vector<llama_token_data> cur;
-    size_t n_considered;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.
 
     std::mt19937 rng;
 };