@fugood/llama.node 1.3.0-rc.0 → 1.3.0-rc.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +2 -0
- package/lib/binding.ts +45 -2
- package/lib/index.js +9 -1
- package/lib/index.ts +9 -0
- package/package.json +14 -14
- package/src/LlamaContext.cpp +10 -0
- package/src/LlamaContext.h +1 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +12 -12
- package/src/llama.cpp/ggml/include/ggml-rpc.h +1 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +3 -2
- package/src/llama.cpp/src/llama-model.cpp +28 -32
package/CMakeLists.txt
CHANGED
package/lib/binding.ts
CHANGED
@@ -236,6 +236,36 @@ export type LlamaCompletionToken = {
   completion_probabilities?: CompletionProbability[]
 }
 
+/**
+ * Result from a parallel completion request (queueCompletion callback).
+ * Extends the basic completion result with per-slot timing information.
+ */
+export type LlamaParallelCompletionResult = {
+  requestId: number
+  text: string
+  reasoning_content?: string
+  content?: string
+  tool_calls?: ToolCall[]
+  chat_format: number
+  stopped_eos: boolean
+  stopped_limit: boolean
+  stopped_word: boolean
+  context_full: boolean
+  tokens_evaluated: number
+  tokens_predicted: number
+  timings: {
+    cache_n: number
+    prompt_n: number
+    prompt_ms: number
+    prompt_per_token_ms: number
+    prompt_per_second: number
+    predicted_n: number
+    predicted_ms: number
+    predicted_per_token_ms: number
+    predicted_per_second: number
+  }
+}
+
 export type TokenizeResult = {
   tokens: Int32Array
   has_media: boolean
@@ -257,6 +287,14 @@ export type RerankResult = {
   index: number
 }
 
+export type BackendDeviceInfo = {
+  backend: string
+  type: string
+  deviceName: string
+  maxMemorySize: number
+  metadata?: Record<string, any>
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number
@@ -457,12 +495,12 @@ export interface LlamaContext {
   /**
    * Queue a completion request for parallel processing
    * @param options Completion options with parallel-specific state management
-   * @param callback Optional
+   * @param callback Optional callback that receives tokens during generation and final result
    * @returns Object with requestId
    */
   queueCompletion(
     options: LlamaParallelCompletionOptions,
-    callback?: (error: any, result:
+    callback?: (error: any, result: LlamaParallelCompletionResult) => void,
   ): { requestId: number }
 
   /**
@@ -505,6 +543,11 @@ export interface LlamaContext {
     enable: boolean,
    callback: (level: string, text: string) => void,
   ): void
+  /**
+   * Get information about available backend devices
+   * @returns Array of backend device information
+   */
+  getBackendDevicesInfo(): BackendDeviceInfo[]
 }
 
 export interface Module {
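
Usage note (not part of the diff): a minimal TypeScript sketch of how the new LlamaParallelCompletionResult callback type could be consumed. It assumes loadModel resolves to a LlamaContext and that LlamaParallelCompletionOptions accepts a prompt field; neither the loadModel options nor the parallel options shape appears in this diff, so those names are assumptions.

  import { loadModel } from '@fugood/llama.node'

  // Hypothetical sketch: only the callback's result type is taken from binding.ts above.
  const context = await loadModel({ model: './model.gguf' } as any) // assumed option shape
  const { requestId } = context.queueCompletion(
    { prompt: 'Hello' } as any, // assumed option shape
    (error, result) => {
      if (error) return console.error(error)
      // result is a LlamaParallelCompletionResult
      console.log(result.requestId === requestId, result.text)
      console.log(`decode speed: ${result.timings.predicted_per_second.toFixed(1)} tok/s`)
    },
  )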
package/lib/index.js
CHANGED
@@ -23,7 +23,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
+exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
@@ -269,6 +269,14 @@ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function*
     return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
 });
 exports.loadLlamaModelInfo = loadLlamaModelInfo;
+const getBackendDevicesInfo = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* (variant = 'default') {
+    var _a;
+    (_a = mods[variant]) !== null && _a !== void 0 ? _a : (mods[variant] = yield (0, binding_1.loadModule)(variant));
+    refreshNativeLogSetup();
+    const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo();
+    return JSON.parse(jsonString);
+});
+exports.getBackendDevicesInfo = getBackendDevicesInfo;
 exports.BuildInfo = {
     number: version_1.BUILD_NUMBER,
     commit: version_1.BUILD_COMMIT,
package/lib/index.ts
CHANGED
@@ -385,6 +385,15 @@ export const loadLlamaModelInfo = async (
   return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
 }
 
+export const getBackendDevicesInfo = async (
+  variant: LibVariant = 'default'
+): Promise<import('./binding').BackendDeviceInfo[]> => {
+  mods[variant] ??= await loadModule(variant)
+  refreshNativeLogSetup()
+  const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo()
+  return JSON.parse(jsonString as any)
+}
+
 export const BuildInfo = {
   number: BUILD_NUMBER,
   commit: BUILD_COMMIT,
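
Usage note (not part of the diff): the new top-level helper lazily loads the requested lib variant, calls the native static getBackendDevicesInfo (which returns a JSON string), and parses it into BackendDeviceInfo[]. A small consumer-side sketch; the available variant strings and the units of maxMemorySize are not stated in this diff, so treat those details as assumptions.

  import { getBackendDevicesInfo } from '@fugood/llama.node'

  const devices = await getBackendDevicesInfo() // default variant; other variants may be valid if installed
  for (const d of devices) {
    // Fields per BackendDeviceInfo in binding.ts; maxMemorySize is assumed to be in bytes.
    console.log(`${d.backend} / ${d.type}: ${d.deviceName} (${(d.maxMemorySize / 1024 / 1024).toFixed(0)} MiB)`)
  }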
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.
+  "version": "1.3.0-rc.2",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.2"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/LlamaContext.cpp
CHANGED
@@ -89,6 +89,13 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo &info) {
   return metadata;
 }
 
+// getBackendDevicesInfo(): string
+Napi::Value LlamaContext::GetBackendDevicesInfo(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  std::string devices_json = rnllama::get_backend_devices_info();
+  return Napi::String::New(env, devices_json);
+}
+
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
@@ -148,6 +155,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
       StaticMethod<&LlamaContext::ToggleNativeLog>(
          "toggleNativeLog",
          static_cast<napi_property_attributes>(napi_enumerable)),
+      StaticMethod<&LlamaContext::GetBackendDevicesInfo>(
+         "getBackendDevicesInfo",
+         static_cast<napi_property_attributes>(napi_enumerable)),
       InstanceMethod<&LlamaContext::GetMultimodalSupport>(
          "getMultimodalSupport",
          static_cast<napi_property_attributes>(napi_enumerable)),
package/src/LlamaContext.h
CHANGED
@@ -25,6 +25,7 @@ public:
   ~LlamaContext();
   static void ToggleNativeLog(const Napi::CallbackInfo &info);
   static Napi::Value ModelInfo(const Napi::CallbackInfo &info);
+  static Napi::Value GetBackendDevicesInfo(const Napi::CallbackInfo &info);
   static void Init(Napi::Env env, Napi::Object &exports);
 
 private:
package/src/llama.cpp/common/json-schema-to-grammar.cpp
CHANGED
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-static void _build_min_max_int(
-    auto has_min = min_value != std::numeric_limits<
-    auto has_max = max_value != std::numeric_limits<
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();
 
     auto digit_range = [&](char from, char to) {
         out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
             out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
             out << ") | [0] | [1-9] ";
             more_digits(0, decimals_left - 1);
         } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             }
             digit_range(c, c);
             out << " (";
-            _build_min_max_int(std::
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
             out << ")";
             if (c < '9') {
                 out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
         _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
     } else {
         out << "\"-\" (";
-        _build_min_max_int(-max_value, std::numeric_limits<
+        _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
         out << ")";
     }
     return;
@@ -925,17 +925,17 @@ public:
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
         } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-
-
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
             if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<
+                min_value = schema["minimum"].get<int64_t>();
             } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
             }
             if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<
+                max_value = schema["maximum"].get<int64_t>();
             } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
             }
             std::stringstream out;
             out << "(";
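
Context note (not part of the diff): the json-schema-to-grammar change widens integer bound handling from int to int64_t, so schema minimum/maximum values outside the 32-bit range are no longer truncated when the grammar is built. A hedged TypeScript illustration of the kind of schema this affects; how such a schema is passed into this package (e.g. via a response_format or json_schema completion option) is not shown here and is an assumption.

  // Bounds beyond the 32-bit int range; with the old int-based code these
  // values could be truncated before grammar generation.
  const schema = {
    type: 'object',
    properties: {
      timestamp_ms: { type: 'integer', minimum: 0, maximum: 9_007_199_254_740_991 },
    },
    required: ['timestamp_ms'],
  }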
package/src/llama.cpp/ggml/include/ggml-rpc.h
CHANGED
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
 
 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
package/src/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp
CHANGED
@@ -485,8 +485,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
     int32_t start = ith * task_per_thread;
     int32_t end = std::min((ith + 1) * task_per_thread, task_count);
     for (int32_t compute_idx = start; compute_idx < end; compute_idx++) {
-        int32_t gemm_idx = compute_idx /
-        int32_t
+        int32_t gemm_idx = compute_idx / per_gemm_block_count_m;
+        int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
+        int32_t m_idx = block_idx_in_gemm * block_size_m;
         const qnbitgemm_spacemit_ime_args & data = qnbitgemm_args[gemm_idx];
         int32_t rows_tobe_handled = (gemm_m - m_idx) > block_size_m ? block_size_m : (gemm_m - m_idx);
package/src/llama.cpp/src/llama-model.cpp
CHANGED
@@ -421,11 +421,8 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error(format("failed to create ggml context"));
             }
 
-            ctx_map
-            pimpl->ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);
 
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     };
 
     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     pimpl->mappings.reserve(ml.mappings.size());
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>>
-
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ctx_map.size());
 
     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-    pimpl->
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-    for (auto &
-
-    ggml_context * ctx = it.second;
+    for (auto & [buft, ctx_ptr] : ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();
 
         // skip contexts without tensors
         if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+        ggml_backend_buffer_t buf = nullptr;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-
+                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-                pimpl->bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
-            pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,10 +6107,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 buf_map.emplace(idx, buf);
             }
         }
-
-        if (pimpl->bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6117,7 +6115,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
-
+        ctx_buf_maps.emplace_back(ctx, buf_map);
     }
 
     if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->
+    for (auto & [_, buf] : pimpl->ctxs_bufs) {
         LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
-    for (auto & ctx : pimpl->
+    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
         for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
             tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
     }
 
     // load tensor data
-    for (auto &
-
-    auto & bufs = it.second;
-    if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
             return false;
         }
     }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const
-    ret[ggml_backend_buffer_get_type(
+    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }