@fugood/llama.node 1.3.0-rc.0 → 1.3.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CMakeLists.txt CHANGED
@@ -124,6 +124,8 @@ include_directories(
   ${CMAKE_JS_INC}
   "src/llama.cpp"
   "src/llama.cpp/src"
+  "src/llama.cpp/ggml/include"
+  "src/llama.cpp/ggml/src"
   "src/tools/mtmd"
 )
 
package/lib/binding.ts CHANGED
@@ -236,6 +236,36 @@ export type LlamaCompletionToken = {
   completion_probabilities?: CompletionProbability[]
 }
 
+/**
+ * Result from a parallel completion request (queueCompletion callback).
+ * Extends the basic completion result with per-slot timing information.
+ */
+export type LlamaParallelCompletionResult = {
+  requestId: number
+  text: string
+  reasoning_content?: string
+  content?: string
+  tool_calls?: ToolCall[]
+  chat_format: number
+  stopped_eos: boolean
+  stopped_limit: boolean
+  stopped_word: boolean
+  context_full: boolean
+  tokens_evaluated: number
+  tokens_predicted: number
+  timings: {
+    cache_n: number
+    prompt_n: number
+    prompt_ms: number
+    prompt_per_token_ms: number
+    prompt_per_second: number
+    predicted_n: number
+    predicted_ms: number
+    predicted_per_token_ms: number
+    predicted_per_second: number
+  }
+}
+
 export type TokenizeResult = {
   tokens: Int32Array
   has_media: boolean
@@ -257,6 +287,14 @@ export type RerankResult = {
   index: number
 }
 
+export type BackendDeviceInfo = {
+  backend: string
+  type: string
+  deviceName: string
+  maxMemorySize: number
+  metadata?: Record<string, any>
+}
+
 export type ModelInfo = {
   desc: string
   nEmbd: number
@@ -457,12 +495,12 @@ export interface LlamaContext {
   /**
    * Queue a completion request for parallel processing
    * @param options Completion options with parallel-specific state management
-   * @param callback Optional token callback
+   * @param callback Optional callback that receives tokens during generation and final result
   * @returns Object with requestId
    */
   queueCompletion(
     options: LlamaParallelCompletionOptions,
-    callback?: (error: any, result: any) => void,
+    callback?: (error: any, result: LlamaParallelCompletionResult) => void,
   ): { requestId: number }
 
   /**
@@ -505,6 +543,11 @@ export interface LlamaContext {
     enable: boolean,
     callback: (level: string, text: string) => void,
   ): void
+  /**
+   * Get information about available backend devices
+   * @returns Array of backend device information
+   */
+  getBackendDevicesInfo(): BackendDeviceInfo[]
 }
 
 export interface Module {
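
A minimal sketch of how the typed queueCompletion callback above can be used, assuming a ctx object that implements the LlamaContext interface (for example a context created through the package's loadModel helper) and a hypothetical prompt field on LlamaParallelCompletionOptions, which this diff does not show:

// Sketch only: `ctx` implements LlamaContext; `prompt` is an assumed option field.
const { requestId } = ctx.queueCompletion(
  { prompt: 'Hello' } as LlamaParallelCompletionOptions,
  (error, result) => {
    if (error) {
      console.error('parallel completion failed:', error)
      return
    }
    // `result` is now typed as LlamaParallelCompletionResult instead of `any`
    console.log(result.text, result.tokens_predicted, result.timings.predicted_per_second)
  },
)
console.log('queued parallel request', requestId)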
package/lib/index.js CHANGED
@@ -23,7 +23,7 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
 });
 };
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.BuildInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
+exports.BuildInfo = exports.getBackendDevicesInfo = exports.loadLlamaModelInfo = exports.initLlama = exports.loadModel = exports.toggleNativeLog = exports.MTMD_DEFAULT_MEDIA_MARKER = exports.LlamaParallelAPI = void 0;
 exports.addNativeLogListener = addNativeLogListener;
 const binding_1 = require("./binding");
 const version_1 = require("./version");
@@ -269,6 +269,14 @@ const loadLlamaModelInfo = (path) => __awaiter(void 0, void 0, void 0, function*
     return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip);
 });
 exports.loadLlamaModelInfo = loadLlamaModelInfo;
+const getBackendDevicesInfo = (...args_1) => __awaiter(void 0, [...args_1], void 0, function* (variant = 'default') {
+    var _a;
+    (_a = mods[variant]) !== null && _a !== void 0 ? _a : (mods[variant] = yield (0, binding_1.loadModule)(variant));
+    refreshNativeLogSetup();
+    const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo();
+    return JSON.parse(jsonString);
+});
+exports.getBackendDevicesInfo = getBackendDevicesInfo;
 exports.BuildInfo = {
     number: version_1.BUILD_NUMBER,
     commit: version_1.BUILD_COMMIT,
package/lib/index.ts CHANGED
@@ -385,6 +385,15 @@ export const loadLlamaModelInfo = async (
   return mods[variant].LlamaContext.loadModelInfo(path, modelInfoSkip)
 }
 
+export const getBackendDevicesInfo = async (
+  variant: LibVariant = 'default'
+): Promise<import('./binding').BackendDeviceInfo[]> => {
+  mods[variant] ??= await loadModule(variant)
+  refreshNativeLogSetup()
+  const jsonString = mods[variant].LlamaContext.getBackendDevicesInfo()
+  return JSON.parse(jsonString as any)
+}
+
 export const BuildInfo = {
   number: BUILD_NUMBER,
   commit: BUILD_COMMIT,
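
A minimal sketch of calling the new getBackendDevicesInfo export shown above; the variant argument defaults to 'default', and the unit of maxMemorySize is not stated in this diff:

// Sketch only: enumerate backend devices, e.g. before choosing GPU offload settings.
import { getBackendDevicesInfo } from '@fugood/llama.node'

const listDevices = async () => {
  const devices = await getBackendDevicesInfo() // pass a LibVariant name to query another build
  for (const d of devices) {
    console.log(`${d.backend} / ${d.type} / ${d.deviceName} (max memory: ${d.maxMemorySize})`)
  }
}

listDevices()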
package/package.json CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.3.0-rc.0",
+  "version": "1.3.0-rc.2",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.3.0-rc.0",
-    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.0",
-    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.0",
-    "@fugood/node-llama-linux-arm64": "1.3.0-rc.0",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.0",
-    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.0",
-    "@fugood/node-llama-win32-x64": "1.3.0-rc.0",
-    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.0",
-    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.0",
-    "@fugood/node-llama-win32-arm64": "1.3.0-rc.0",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.0",
-    "@fugood/node-llama-darwin-x64": "1.3.0-rc.0",
-    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.0"
+    "@fugood/node-llama-linux-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-x64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-x64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-linux-arm64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-x64-cuda": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-arm64": "1.3.0-rc.2",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.3.0-rc.2",
+    "@fugood/node-llama-darwin-x64": "1.3.0-rc.2",
+    "@fugood/node-llama-darwin-arm64": "1.3.0-rc.2"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
@@ -89,6 +89,13 @@ Napi::Value LlamaContext::ModelInfo(const Napi::CallbackInfo &info) {
   return metadata;
 }
 
+// getBackendDevicesInfo(): string
+Napi::Value LlamaContext::GetBackendDevicesInfo(const Napi::CallbackInfo &info) {
+  Napi::Env env = info.Env();
+  std::string devices_json = rnllama::get_backend_devices_info();
+  return Napi::String::New(env, devices_json);
+}
+
 void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
   Napi::Function func = DefineClass(
       env, "LlamaContext",
@@ -148,6 +155,9 @@ void LlamaContext::Init(Napi::Env env, Napi::Object &exports) {
        StaticMethod<&LlamaContext::ToggleNativeLog>(
            "toggleNativeLog",
            static_cast<napi_property_attributes>(napi_enumerable)),
+       StaticMethod<&LlamaContext::GetBackendDevicesInfo>(
+           "getBackendDevicesInfo",
+           static_cast<napi_property_attributes>(napi_enumerable)),
        InstanceMethod<&LlamaContext::GetMultimodalSupport>(
            "getMultimodalSupport",
            static_cast<napi_property_attributes>(napi_enumerable)),
@@ -25,6 +25,7 @@ public:
   ~LlamaContext();
   static void ToggleNativeLog(const Napi::CallbackInfo &info);
   static Napi::Value ModelInfo(const Napi::CallbackInfo &info);
+  static Napi::Value GetBackendDevicesInfo(const Napi::CallbackInfo &info);
   static void Init(Napi::Env env, Napi::Object &exports);
 
 private:
@@ -41,9 +41,9 @@ static std::string build_repetition(const std::string & item_rule, int min_items
     return result;
 }
 
-static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
-    auto has_min = min_value != std::numeric_limits<int>::min();
-    auto has_max = max_value != std::numeric_limits<int>::max();
+static void _build_min_max_int(int64_t min_value, int64_t max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
+    auto has_min = min_value != std::numeric_limits<int64_t>::min();
+    auto has_max = max_value != std::numeric_limits<int64_t>::max();
 
     auto digit_range = [&](char from, char to) {
         out << "[";
@@ -159,7 +159,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
     if (has_min) {
         if (min_value < 0) {
             out << "\"-\" (";
-            _build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(std::numeric_limits<int64_t>::min(), -min_value, out, decimals_left, /* top_level= */ false);
             out << ") | [0] | [1-9] ";
             more_digits(0, decimals_left - 1);
         } else if (min_value == 0) {
@@ -194,7 +194,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             }
             digit_range(c, c);
             out << " (";
-            _build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
+            _build_min_max_int(std::stoll(min_s.substr(1)), std::numeric_limits<int64_t>::max(), out, less_decimals, /* top_level= */ false);
             out << ")";
             if (c < '9') {
                 out << " | ";
@@ -216,7 +216,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
             _build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
         } else {
             out << "\"-\" (";
-            _build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
+            _build_min_max_int(-max_value, std::numeric_limits<int64_t>::max(), out, decimals_left, /* top_level= */ false);
             out << ")";
         }
         return;
@@ -925,17 +925,17 @@ public:
             int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
             return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
         } else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
-            int min_value = std::numeric_limits<int>::min();
-            int max_value = std::numeric_limits<int>::max();
+            int64_t min_value = std::numeric_limits<int64_t>::min();
+            int64_t max_value = std::numeric_limits<int64_t>::max();
             if (schema.contains("minimum")) {
-                min_value = schema["minimum"].get<int>();
+                min_value = schema["minimum"].get<int64_t>();
             } else if (schema.contains("exclusiveMinimum")) {
-                min_value = schema["exclusiveMinimum"].get<int>() + 1;
+                min_value = schema["exclusiveMinimum"].get<int64_t>() + 1;
             }
             if (schema.contains("maximum")) {
-                max_value = schema["maximum"].get<int>();
+                max_value = schema["maximum"].get<int64_t>();
             } else if (schema.contains("exclusiveMaximum")) {
-                max_value = schema["exclusiveMaximum"].get<int>() - 1;
+                max_value = schema["exclusiveMaximum"].get<int64_t>() - 1;
             }
             std::stringstream out;
             out << "(";
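
For context on the int to int64_t change above: integer bounds that do not fit in 32 bits previously went through get<int>() and could be truncated before the grammar was generated. An illustrative JSON Schema with such bounds follows (plain data only; how the schema is passed to a completion call is outside this diff):

// Illustrative only: bounds above INT32_MAX (2147483647) now survive grammar conversion.
const schema = {
  type: 'object',
  properties: {
    // e.g. a millisecond Unix timestamp, which needs more than 32 bits
    timestamp_ms: { type: 'integer', minimum: 0, maximum: 9007199254740991 },
  },
  required: ['timestamp_ms'],
}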
@@ -21,8 +21,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total);
 
 GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir,
-                                                    size_t n_threads, size_t n_devices,
-                                                    ggml_backend_dev_t * devices, size_t * free_mem, size_t * total_mem);
+                                                    size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices);
 
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint);
@@ -485,8 +485,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
         int32_t start = ith * task_per_thread;
         int32_t end = std::min((ith + 1) * task_per_thread, task_count);
         for (int32_t compute_idx = start; compute_idx < end; compute_idx++) {
-            int32_t gemm_idx = compute_idx / block_size_m;
-            int32_t m_idx = compute_idx % block_size_m * block_size_m;
+            int32_t gemm_idx = compute_idx / per_gemm_block_count_m;
+            int32_t block_idx_in_gemm = compute_idx % per_gemm_block_count_m;
+            int32_t m_idx = block_idx_in_gemm * block_size_m;
             const qnbitgemm_spacemit_ime_args & data = qnbitgemm_args[gemm_idx];
             int32_t rows_tobe_handled = (gemm_m - m_idx) > block_size_m ? block_size_m : (gemm_m - m_idx);
 
@@ -421,11 +421,8 @@ struct llama_model::impl {
     llama_mlocks mlock_bufs;
     llama_mlocks mlock_mmaps;
 
-    // contexts where the model tensors metadata is stored
-    std::vector<ggml_context_ptr> ctxs;
-
-    // the model memory buffers for the tensor data
-    std::vector<ggml_backend_buffer_ptr> bufs;
+    // contexts where the model tensors metadata is stored as well ass the corresponding buffers:
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
     buft_list_t cpu_buft_list;
     std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
@@ -2182,7 +2179,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     max_n_tensors += n_layer*2; // duplicated rope freq tensors
     const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
 
-    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    // define a comparator for the buft -> ctx map to ensure that the order is well-defined:
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return ggml_backend_buft_name(lhs) < ggml_backend_buft_name(rhs);
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
@@ -2197,12 +2201,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 throw std::runtime_error(format("failed to create ggml context"));
             }
 
-            ctx_map[buft] = ctx;
-            pimpl->ctxs.emplace_back(ctx);
+            ctx_map.emplace(buft, ctx);
 
             return ctx;
         }
-        return it->second;
+        return it->second.get();
     };
 
     const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
@@ -6037,16 +6040,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     pimpl->mappings.reserve(ml.mappings.size());
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
-    ctx_bufs.reserve(ctx_map.size());
+    std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_buf_maps;
+    ctx_buf_maps.reserve(ctx_map.size());
 
     // Ensure we have enough capacity for the maximum backend buffer we will potentially create
     const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
-    pimpl->bufs.reserve(n_max_backend_buffer);
+    pimpl->ctxs_bufs.reserve(n_max_backend_buffer);
 
-    for (auto & it : ctx_map) {
-        ggml_backend_buffer_type_t buft = it.first;
-        ggml_context * ctx = it.second;
+    for (auto & [buft, ctx_ptr] : ctx_map) {
+        ggml_context * ctx = ctx_ptr.get();
 
         // skip contexts without tensors
         if (ggml_get_first_tensor(ctx) == nullptr) {
@@ -6070,6 +6072,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
 
+        ggml_backend_buffer_t buf = nullptr;
         if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -6082,20 +6085,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     continue;
                 }
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
-                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+                buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
                     throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
-                pimpl->bufs.emplace_back(buf);
                 buf_map.emplace(idx, buf);
             }
         }
         else {
-            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
                 throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
-            pimpl->bufs.emplace_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 pimpl->mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = pimpl->mlock_bufs.back();
@@ -6106,10 +6107,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             buf_map.emplace(idx, buf);
             }
         }
-
-        if (pimpl->bufs.empty()) {
-            throw std::runtime_error("failed to allocate buffer");
-        }
+        pimpl->ctxs_bufs.emplace_back(std::move(ctx_ptr), buf);
 
         for (auto & buf : buf_map) {
             // indicate that this buffer contains weights
@@ -6117,7 +6115,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         }
 
-        ctx_bufs.emplace_back(ctx, buf_map);
+        ctx_buf_maps.emplace_back(ctx, buf_map);
     }
 
     if (llama_supports_gpu_offload()) {
@@ -6135,22 +6133,20 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     // print memory requirements per buffer type
-    for (auto & buf : pimpl->bufs) {
+    for (auto & [_, buf] : pimpl->ctxs_bufs) {
        LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
     }
 
     // populate tensors_by_name
-    for (auto & ctx : pimpl->ctxs) {
+    for (auto & [ctx, _] : pimpl->ctxs_bufs) {
         for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
             tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
     }
 
     // load tensor data
-    for (auto & it : ctx_bufs) {
-        ggml_context * ctx = it.first;
-        auto & bufs = it.second;
-        if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+    for (auto & [ctx, buf_map] : ctx_buf_maps) {
+        if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
             return false;
         }
     }
@@ -6190,8 +6186,8 @@ size_t llama_model::n_devices() const {
 
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
-    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
-        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
+    for (const auto & [_, buf] : pimpl->ctxs_bufs) {
+        ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
     }
     return ret;
 }