@fugood/llama.node 0.0.1-alpha.3 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CMakeLists.txt +36 -7
  2. package/README.md +9 -0
  3. package/bin/darwin/arm64/default.metallib +0 -0
  4. package/bin/darwin/arm64/llama-node.node +0 -0
  5. package/bin/darwin/x64/default.metallib +0 -0
  6. package/bin/darwin/x64/llama-node.node +0 -0
  7. package/bin/linux/arm64/llama-node.node +0 -0
  8. package/bin/linux/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/lib/binding.js +18 -1
  14. package/lib/binding.ts +22 -2
  15. package/lib/index.ts +2 -2
  16. package/package.json +15 -3
  17. package/src/LlamaCompletionWorker.cpp +5 -1
  18. package/src/LlamaCompletionWorker.h +4 -0
  19. package/src/LlamaContext.cpp +18 -1
  20. package/src/common.hpp +11 -7
  21. package/src/llama.cpp/CMakeLists.txt +13 -7
  22. package/src/llama.cpp/common/common.cpp +221 -173
  23. package/src/llama.cpp/common/common.h +19 -8
  24. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  25. package/src/llama.cpp/common/log.h +2 -2
  26. package/src/llama.cpp/common/sampling.cpp +17 -1
  27. package/src/llama.cpp/common/sampling.h +28 -20
  28. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
  29. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
  30. package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
  31. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
  32. package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
  33. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
  34. package/src/llama.cpp/examples/llava/clip.cpp +74 -23
  35. package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
  36. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
  37. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
  38. package/src/llama.cpp/examples/main/main.cpp +10 -8
  39. package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
  40. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  41. package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
  42. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
  43. package/src/llama.cpp/examples/server/server.cpp +97 -86
  44. package/src/llama.cpp/examples/server/utils.hpp +17 -15
  45. package/src/llama.cpp/ggml-backend.c +7 -5
  46. package/src/llama.cpp/ggml-impl.h +339 -4
  47. package/src/llama.cpp/ggml-kompute.cpp +7 -0
  48. package/src/llama.cpp/ggml-opencl.cpp +1 -0
  49. package/src/llama.cpp/ggml-quants.c +302 -293
  50. package/src/llama.cpp/ggml-sycl.cpp +28 -16
  51. package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
  52. package/src/llama.cpp/ggml-vulkan.cpp +951 -263
  53. package/src/llama.cpp/ggml.c +1469 -116
  54. package/src/llama.cpp/ggml.h +37 -7
  55. package/src/llama.cpp/llama.cpp +969 -432
  56. package/src/llama.cpp/llama.h +46 -14
  57. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
  58. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
  59. package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
  60. package/src/llama.cpp/requirements.txt +1 -0
  61. package/src/llama.cpp/sgemm.cpp +134 -103
  62. package/src/llama.cpp/sgemm.h +4 -2
  63. package/src/llama.cpp/tests/CMakeLists.txt +96 -36
  64. package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
  65. package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
  66. package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
  67. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
  68. package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
  69. package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
  70. package/src/llama.cpp/unicode-data.cpp +1188 -656
  71. package/src/llama.cpp/unicode-data.h +4 -3
  72. package/src/llama.cpp/unicode.cpp +590 -49
  73. package/src/llama.cpp/unicode.h +6 -3
  74. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
  75. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190
package/CMakeLists.txt CHANGED
@@ -26,7 +26,7 @@ string(REPLACE "i686" "ia32" ARCH ${ARCH})
  string(REPLACE "i386" "ia32" ARCH ${ARCH})
  string(REPLACE "armv7l" "arm" ARCH ${ARCH})
  string(REPLACE "arm" "arm" ARCH ${ARCH})
- string(REPLACE "arm64ex" "arm64" ARCH ${ARCH})
+ string(REPLACE "arm64x" "arm64" ARCH ${ARCH})
  string(REPLACE "aarch64" "arm64" ARCH ${ARCH})
 
  if(DEFINED VARIANT)
@@ -58,6 +58,12 @@ include_directories(${CMAKE_JS_INC})
  # flags: -fPIC
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
+ # VULKAN_SDK
+ if (VULKAN_SDK)
+ set(ENV{VULKAN_SDK} ${VULKAN_SDK})
+ find_package(Vulkan REQUIRED)
+ endif()
+
  set(LLAMA_STATIC ON CACHE BOOL "Build llama as static library")
  add_subdirectory("src/llama.cpp")
 
@@ -81,31 +87,54 @@ add_library(${PROJECT_NAME} SHARED ${SOURCE_FILES} ${CMAKE_JS_SRC})
  set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" SUFFIX ".node")
  target_link_libraries(${PROJECT_NAME} ${CMAKE_JS_LIB} llama ggml common)
 
+ add_custom_target(copy_assets ALL DEPENDS ${PROJECT_NAME})
+
+ add_custom_command(
+ TARGET copy_assets
+ COMMAND ${CMAKE_COMMAND} -E remove_directory ${PLATFORM_BINARY_DIR}
+ COMMENT "Cleaning bin folder"
+ )
+
  if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
  # Generate node.lib
  execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
  # copy target to bin folder
  get_filename_component(CMAKE_JS_NODELIB_TARGET_NAME ${CMAKE_JS_NODELIB_TARGET} NAME)
- add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
+ add_custom_command(TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_JS_NODELIB_TARGET} ${PLATFORM_BINARY_DIR}/${CMAKE_JS_NODELIB_TARGET_NAME}
  COMMENT "Copying to bin folder"
  )
  endif()
 
  # copy target to bin folder
- add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
+ add_custom_command(TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:${PROJECT_NAME}> ${PLATFORM_BINARY_DIR}/$<TARGET_FILE_NAME:${PROJECT_NAME}>
  COMMENT "Copying to bin folder"
  )
 
  if (LLAMA_METAL)
  # copy ${CMAKE_BINARY_DIR}/bin/default.metallib
- add_custom_target(copy_metallib)
  add_custom_command(
- TARGET copy_metallib
+ TARGET copy_assets
  COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/bin/default.metallib ${PLATFORM_BINARY_DIR}/default.metallib
  COMMENT "Copying default.metallib to bin folder"
  )
- add_dependencies(copy_metallib ggml-metal)
- add_dependencies(${PROJECT_NAME} copy_metallib)
+ add_dependencies(copy_assets ggml-metal)
+ endif()
+
+ if (LLAMA_CLBLAST)
+ find_package(CLBlast)
+ if (CLBlast_FOUND)
+ message(STATUS "CLBlast found: ${CLBlast_DIR}")
+ file(
+ GLOB CLBlast_SO_FILES
+ ${CLBlast_DIR}/../../../bin/clblast.dll
+ ${CLBlast_DIR}/../../../lib/libclblast.so
+ )
+ add_custom_command(
+ TARGET copy_assets
+ COMMAND ${CMAKE_COMMAND} -E copy ${CLBlast_SO_FILES} ${PLATFORM_BINARY_DIR}
+ COMMENT "Copying CLBlast SO files to bin folder"
+ )
+ endif()
+ endif()
  endif()
package/README.md CHANGED
@@ -1,5 +1,9 @@
  # llama.node
 
+ [![CI](https://github.com/mybigday/llama.node/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/mybigday/llama.node/actions/workflows/ci.yml)
+ [![NPM Version](https://img.shields.io/npm/v/%40fugood%2Fllama.node)](https://www.npmjs.com/package/@fugood/llama.node)
+ ![NPM Downloads](https://img.shields.io/npm/dw/%40fugood%2Fllama.node)
+
  Node binding of [llama.cpp](https://github.com/ggerganov/llama.cpp).
 
  [llama.cpp](https://github.com/ggerganov/llama.cpp): Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
@@ -22,6 +26,7 @@ const context = await loadModel({
  n_ctx: 2048,
  n_gpu_layers: 1, // > 0: enable GPU
  // embedding: true, // use embedding
+ // lib_variant: 'opencl', // Change backend
  })
 
  // Do completion
@@ -40,6 +45,10 @@ const { text, timings } = await context.completion(
  console.log('Result:', text)
  ```
 
+ ## Lib Variants
+
+ - [x] `default`: General usage, Supported GPU: Metal (macOS) and Vulkan (Linux / Windows)
+
  ## License
 
  MIT
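For orientation, here is a minimal sketch of how the new `lib_variant` option from this README change fits into the documented usage. It is assembled only from the fragments visible in this diff; the model path, prompt text, and the `prompt` field name are illustrative assumptions, not taken from the package.

```ts
// Sketch only: README usage plus the new `lib_variant` option from 0.1.0.
// Assumes loadModel is exported from the package root as the README implies.
import { loadModel } from '@fugood/llama.node'

const main = async () => {
  const context = await loadModel({
    model: './model.gguf',   // hypothetical model path
    n_ctx: 2048,
    n_gpu_layers: 1,         // > 0: enable GPU
    lib_variant: 'opencl',   // selects bin/<platform>-opencl/<arch>/llama-node.node
  })

  const { text, timings } = await context.completion({
    prompt: 'Hello, llama!', // field name assumed
    max_tokens: 64,
    stop: ['\n\n'],
    // grammar: '...',       // optional GBNF grammar string, new in 0.1.0
  })
  console.log('Result:', text, timings)
}

main()
```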
Binary files changed (entries 3–12 above: the prebuilt `llama-node.node` binaries, Windows `node.lib` import libraries, and macOS `default.metallib`); contents not shown.
package/lib/binding.js CHANGED
@@ -33,13 +33,30 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
  };
  Object.defineProperty(exports, "__esModule", { value: true });
  exports.loadModule = void 0;
+ const path = __importStar(require("path"));
+ const setupEnv = (variant) => {
+ var _a, _b;
+ const postfix = variant ? `-${variant}` : '';
+ const binPath = path.resolve(__dirname, `../bin/${process.platform}${postfix}/${process.arch}/`);
+ const systemPathEnv = (_b = (_a = process.env.PATH) !== null && _a !== void 0 ? _a : process.env.Path) !== null && _b !== void 0 ? _b : '';
+ if (!systemPathEnv.includes(binPath)) {
+ if (process.platform === 'win32') {
+ process.env.Path = `${binPath};${systemPathEnv}`;
+ }
+ else {
+ process.env.PATH = `${binPath}:${systemPathEnv}`;
+ }
+ }
+ };
  const loadModule = (variant) => __awaiter(void 0, void 0, void 0, function* () {
  try {
- if (variant) {
+ if (variant && variant !== 'default') {
+ setupEnv(variant);
  return yield Promise.resolve(`${`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`}`).then(s => __importStar(require(s)));
  }
  }
  catch (_a) { } // ignore errors and try the common path
+ setupEnv();
  return yield Promise.resolve(`${`../bin/${process.platform}/${process.arch}/llama-node.node`}`).then(s => __importStar(require(s)));
  });
  exports.loadModule = loadModule;
package/lib/binding.ts CHANGED
@@ -1,3 +1,5 @@
+ import * as path from 'path'
+
  export type LlamaModelOptions = {
  model: string
  embedding?: boolean
@@ -21,6 +23,7 @@ export type LlamaCompletionOptions = {
  max_tokens?: number
  seed?: number
  stop?: string[]
+ grammar?: string
  }
 
  export type LlamaCompletionResult = {
@@ -48,11 +51,28 @@ export interface Module {
  LlamaContext: LlamaContext
  }
 
- export const loadModule = async (variant?: string): Promise<Module> => {
+ export type LibVariant = 'default' | 'opencl'
+
+ const setupEnv = (variant?: string) => {
+ const postfix = variant ? `-${variant}` : ''
+ const binPath = path.resolve(__dirname, `../bin/${process.platform}${postfix}/${process.arch}/`)
+ const systemPathEnv = process.env.PATH ?? process.env.Path ?? ''
+ if (!systemPathEnv.includes(binPath)) {
+ if (process.platform === 'win32') {
+ process.env.Path = `${binPath};${systemPathEnv}`
+ } else {
+ process.env.PATH = `${binPath}:${systemPathEnv}`
+ }
+ }
+ }
+
+ export const loadModule = async (variant?: LibVariant): Promise<Module> => {
  try {
- if (variant) {
+ if (variant && variant !== 'default') {
+ setupEnv(variant)
  return await import(`../bin/${process.platform}-${variant}/${process.arch}/llama-node.node`) as Module
  }
  } catch {} // ignore errors and try the common path
+ setupEnv()
  return await import(`../bin/${process.platform}/${process.arch}/llama-node.node`) as Module
  }
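The `loadModule` change above does two things: it falls back to the default binary when a variant build is missing or fails to load, and it prepends the variant's `bin` directory to `PATH`/`Path` so bundled shared libraries (for example the CLBlast DLL copied by the CMake change) can be resolved at load time. A minimal caller-side sketch, assuming the deep import path implied by this package's layout:

```ts
// Sketch: request the 'opencl' build; if that binary is missing or fails to
// load, loadModule falls back to bin/<platform>/<arch>/llama-node.node.
// The deep import path is an assumption; loadModel from the package root
// normally wraps this call.
import { loadModule } from '@fugood/llama.node/lib/binding'

;(async () => {
  const mod = await loadModule('opencl')
  // Module exposes the LlamaContext class declared in binding.ts.
  console.log(typeof mod.LlamaContext)
})()
```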
package/lib/index.ts CHANGED
@@ -1,10 +1,10 @@
  import { loadModule, LlamaModelOptions } from './binding'
- import type { Module, LlamaContext } from './binding'
+ import type { Module, LlamaContext, LibVariant } from './binding'
 
  export * from './binding'
 
  export interface LlamaModelOptionsExtended extends LlamaModelOptions {
- lib_variant?: string
+ lib_variant?: LibVariant
  }
 
  const mods: { [key: string]: Module } = {}
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "0.0.1-alpha.3",
+ "version": "0.1.0",
  "description": "Llama.cpp for Node.js",
  "main": "lib/index.js",
  "scripts": {
@@ -39,8 +39,6 @@
  },
  "files": [
  "bin/**/*",
- "scripts/*.js",
- "scripts/*.ts",
  "src/**/*.{c,cc,cpp,h,hh,hpp,txt,cmake}",
  "lib/*.js",
  "lib/*.ts",
@@ -62,5 +60,19 @@
  },
  "dependencies": {
  "node-addon-api": "^8.0.0"
+ },
+ "jest": {
+ "testEnvironment": "node",
+ "moduleFileExtensions": [
+ "ts",
+ "tsx",
+ "js",
+ "jsx",
+ "json",
+ "node"
+ ],
+ "testMatch": [
+ "**/*.test.ts"
+ ]
  }
  }
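The new `jest` block only configures test discovery (`**/*.test.ts` with ts/tsx/js module resolution); a hypothetical test file that it would pick up might look like the following sketch (the file name and assertion are illustrative, not part of the package):

```ts
// lib/binding.test.ts (hypothetical) — matched by the new "**/*.test.ts" pattern.
import { loadModule } from './binding'

test('loads the default native binding', async () => {
  const mod = await loadModule() // no variant: resolves bin/<platform>/<arch>/
  expect(mod.LlamaContext).toBeDefined()
})
```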
package/src/LlamaCompletionWorker.cpp CHANGED
@@ -58,7 +58,7 @@ void LlamaCompletionWorker::Execute() {
  const auto n_keep = _params.n_keep;
  size_t n_cur = 0;
  size_t n_input = 0;
- const auto model = llama_get_model(_sess->context());
+ const auto model = _sess->model();
  const bool add_bos = llama_should_add_bos_token(model);
  auto ctx = _sess->context();
 
@@ -110,6 +110,7 @@ void LlamaCompletionWorker::Execute() {
  // sample the next token
  const llama_token new_token_id =
  llama_sampling_sample(sampling.get(), ctx, nullptr);
+ llama_sampling_accept(sampling.get(), ctx, new_token_id, true);
  // prepare the next batch
  embd->emplace_back(new_token_id);
  auto token = llama_token_to_piece(ctx, new_token_id);
@@ -143,6 +144,9 @@ void LlamaCompletionWorker::Execute() {
  }
  const auto t_main_end = ggml_time_us();
  _sess->get_mutex().unlock();
+ if (_onComplete) {
+ _onComplete();
+ }
  }
 
  void LlamaCompletionWorker::OnOK() {
package/src/LlamaCompletionWorker.h CHANGED
@@ -1,4 +1,5 @@
  #include "common.hpp"
+ #include <functional>
 
  struct CompletionResult {
  std::string text = "";
@@ -18,6 +19,8 @@ public:
 
  inline void Stop() { _stop = true; }
 
+ inline void onComplete(std::function<void()> cb) { _onComplete = cb; }
+
  protected:
  void Execute();
  void OnOK();
@@ -30,5 +33,6 @@ private:
  Napi::ThreadSafeFunction _tsfn;
  bool _has_callback = false;
  bool _stop = false;
+ std::function<void()> _onComplete;
  CompletionResult _result;
  };
package/src/LlamaContext.cpp CHANGED
@@ -70,7 +70,7 @@ LlamaContext::LlamaContext(const Napi::CallbackInfo &info)
  .ThrowAsJavaScriptException();
  }
 
- _sess = std::make_shared<LlamaSession>(ctx, params);
+ _sess = std::make_shared<LlamaSession>(model, ctx, params);
  _info = get_system_info(params);
  }
 
@@ -93,6 +93,10 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ if (_wip != nullptr) {
+ Napi::TypeError::New(env, "Another completion is in progress")
+ .ThrowAsJavaScriptException();
+ }
  auto options = info[0].As<Napi::Object>();
 
  gpt_params params = _sess->params();
@@ -143,6 +147,7 @@ Napi::Value LlamaContext::Completion(const Napi::CallbackInfo &info) {
  new LlamaCompletionWorker(info, _sess, callback, params, stop_words);
  worker->Queue();
  _wip = worker;
+ worker->onComplete([this]() { _wip = nullptr; });
  return worker->Promise();
  }
 
@@ -163,6 +168,12 @@ Napi::Value LlamaContext::SaveSession(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ #ifdef GGML_USE_VULKAN
+ if (_sess->params().n_gpu_layers > 0) {
+ Napi::TypeError::New(env, "Vulkan cannot save session")
+ .ThrowAsJavaScriptException();
+ }
+ #endif
  auto *worker = new SaveSessionWorker(info, _sess);
  worker->Queue();
  return worker->Promise();
@@ -178,6 +189,12 @@ Napi::Value LlamaContext::LoadSession(const Napi::CallbackInfo &info) {
  Napi::TypeError::New(env, "Context is disposed")
  .ThrowAsJavaScriptException();
  }
+ #ifdef GGML_USE_VULKAN
+ if (_sess->params().n_gpu_layers > 0) {
+ Napi::TypeError::New(env, "Vulkan cannot load session")
+ .ThrowAsJavaScriptException();
+ }
+ #endif
  auto *worker = new LoadSessionWorker(info, _sess);
  worker->Queue();
  return worker->Promise();
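Two behavioral changes land here: a context now refuses a second `completion()` call while one is still running (the slot is freed via `worker->onComplete`), and builds with Vulkan refuse session save/load when GPU layers are offloaded. A rough caller-side sketch of the concurrency guard, assuming the JS API surface of this package (the `prompt` field name is an assumption):

```ts
// Sketch of the new "Another completion is in progress" guard as seen
// from JavaScript. Exact error delivery (throw vs. rejection) may differ.
const first = context.completion({ prompt: 'first request' })

try {
  await context.completion({ prompt: 'second request' }) // refused while `first` is pending
} catch (err) {
  console.warn('busy:', err) // "Another completion is in progress"
}

await first // once this settles, the context accepts new completions
```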
package/src/common.hpp CHANGED
@@ -46,32 +46,36 @@ constexpr T get_option(const Napi::Object &options, const std::string &name,
 
  class LlamaSession {
  public:
- LlamaSession(llama_context *ctx, gpt_params params)
- : ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
+ LlamaSession(llama_model *model, llama_context *ctx, gpt_params params)
+ : model_(LlamaCppModel(model, llama_free_model)), ctx_(LlamaCppContext(ctx, llama_free)), params_(params) {
  tokens_.reserve(params.n_ctx);
  }
 
  ~LlamaSession() { dispose(); }
 
- llama_context *context() { return ctx_.get(); }
+ inline llama_context *context() { return ctx_.get(); }
 
- std::vector<llama_token>* tokens_ptr() { return &tokens_; }
+ inline llama_model *model() { return model_.get(); }
 
- void set_tokens(std::vector<llama_token> tokens) {
+ inline std::vector<llama_token>* tokens_ptr() { return &tokens_; }
+
+ inline void set_tokens(std::vector<llama_token> tokens) {
  tokens_ = std::move(tokens);
  }
 
- const gpt_params &params() const { return params_; }
+ inline const gpt_params &params() const { return params_; }
 
- std::mutex &get_mutex() { return mutex; }
+ inline std::mutex &get_mutex() { return mutex; }
 
  void dispose() {
  std::lock_guard<std::mutex> lock(mutex);
  tokens_.clear();
  ctx_.reset();
+ model_.reset();
  }
 
  private:
+ LlamaCppModel model_;
  LlamaCppContext ctx_;
  const gpt_params params_;
  std::vector<llama_token> tokens_{};
package/src/llama.cpp/CMakeLists.txt CHANGED
@@ -43,11 +43,7 @@ else()
  set(LLAMA_METAL_DEFAULT OFF)
  endif()
 
- if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
- set(LLAMA_LLAMAFILE_DEFAULT OFF)
- else()
- set(LLAMA_LLAMAFILE_DEFAULT ON)
- endif()
+ set(LLAMA_LLAMAFILE_DEFAULT ON)
 
  # general
  option(BUILD_SHARED_LIBS "build shared libraries" OFF)
@@ -107,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
  set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
  "llama: max. batch size for using peer access")
  option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+ option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
  option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
  option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
  option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -407,12 +405,16 @@ if (LLAMA_CUDA)
  list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
  add_compile_definitions(GGML_USE_CUDA)
+ add_compile_definitions(GGML_CUDA_USE_GRAPHS)
  if (LLAMA_CUDA_FORCE_DMMV)
  add_compile_definitions(GGML_CUDA_FORCE_DMMV)
  endif()
  if (LLAMA_CUDA_FORCE_MMQ)
  add_compile_definitions(GGML_CUDA_FORCE_MMQ)
  endif()
+ if (LLAMA_CUDA_NO_VMM)
+ add_compile_definitions(GGML_CUDA_NO_VMM)
+ endif()
  add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
  add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
  if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -429,7 +431,7 @@ if (LLAMA_CUDA)
 
  if (LLAMA_STATIC)
  if (WIN32)
- # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+ # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
  else ()
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -438,7 +440,11 @@ if (LLAMA_CUDA)
  set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
  endif()
 
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+ if (LLAMA_CUDA_NO_VMM)
+ # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+ else()
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+ endif()
 
  if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
  # 52 == lowest CUDA 12 standard