@fugood/llama.node 1.4.13 → 1.4.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,7 +110,7 @@ struct llama_file::impl {
110
110
  }
111
111
  }
112
112
 
113
- void read_raw(void * ptr, size_t len) const {
113
+ void read_raw(void * ptr, size_t len) {
114
114
  size_t bytes_read = 0;
115
115
  while (bytes_read < len) {
116
116
  size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
@@ -127,7 +127,7 @@ struct llama_file::impl {
127
127
  }
128
128
  }
129
129
 
130
- uint32_t read_u32() const {
130
+ uint32_t read_u32() {
131
131
  uint32_t val;
132
132
  read_raw(&val, sizeof(val));
133
133
  return val;
@@ -154,8 +154,8 @@ struct llama_file::impl {
154
154
  write_raw(&val, sizeof(val));
155
155
  }
156
156
 
157
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
158
- throw std::runtime_error("DirectIO is not implemented on Windows.");
157
+ bool has_direct_io() const {
158
+ return true;
159
159
  }
160
160
 
161
161
  ~impl() {
@@ -164,33 +164,45 @@ struct llama_file::impl {
164
164
  }
165
165
  }
166
166
  #else
167
- impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
167
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) : fname(fname) {
168
168
  #ifdef __linux__
169
169
  // Try unbuffered I/O for read only
170
170
  if (use_direct_io && std::strcmp(mode, "rb") == 0) {
171
- fd = open(fname, O_RDONLY | O_DIRECT);
171
+ if (init_fd()) {
172
+ return;
173
+ }
174
+ LLAMA_LOG_WARN("Failed to open file '%s' with error: %s. Falling back to buffered I/O",
175
+ fname, strerror(errno));
176
+ }
177
+ #endif
178
+ init_fp(mode);
179
+ }
172
180
 
173
- if (fd != -1) {
174
- struct stat file_stats{};
175
- fstat(fd, &file_stats);
181
+ #ifdef __linux__
182
+ bool init_fd() {
183
+ fd = open(fname.c_str(), O_RDONLY | O_DIRECT);
176
184
 
177
- size = file_stats.st_size;
178
- alignment = file_stats.st_blksize;
185
+ if (fd != -1) {
186
+ struct stat file_stats{};
187
+ fstat(fd, &file_stats);
179
188
 
180
- off_t ret = lseek(fd, 0, SEEK_SET);
181
- if (ret == -1) {
182
- throw std::runtime_error(format("seek error: %s", strerror(errno)));
183
- }
184
- return;
185
- }
189
+ size = file_stats.st_size;
190
+ alignment = file_stats.st_blksize;
186
191
 
187
- LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
188
- fname, strerror(errno));
192
+ off_t ret = lseek(fd, 0, SEEK_SET);
193
+ if (ret == -1) {
194
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
195
+ }
196
+ return true;
189
197
  }
198
+ return false;
199
+ }
190
200
  #endif
191
- fp = ggml_fopen(fname, mode);
201
+
202
+ void init_fp(const char * mode) {
203
+ fp = ggml_fopen(fname.c_str(), mode);
192
204
  if (fp == NULL) {
193
- throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
205
+ throw std::runtime_error(format("failed to open %s: %s", fname.c_str(), strerror(errno)));
194
206
  }
195
207
  seek(0, SEEK_END);
196
208
  size = tell();
@@ -226,7 +238,7 @@ struct llama_file::impl {
226
238
  }
227
239
  }
228
240
 
229
- void read_raw(void * ptr, size_t len) const {
241
+ void read_raw_unsafe(void * ptr, size_t len) {
230
242
  if (len == 0) {
231
243
  return;
232
244
  }
@@ -249,6 +261,17 @@ struct llama_file::impl {
249
261
  if (errno == EINTR) {
250
262
  continue; // Interrupted by signal, retry
251
263
  }
264
+ // Fallback to std::fread in case the DMA controller cannot access the buffer
265
+ if (errno == EFAULT) {
266
+ auto curr_off = tell();
267
+ close(fd);
268
+ fd = -1;
269
+ alignment = 1;
270
+ init_fp("rb");
271
+ seek(curr_off, SEEK_SET);
272
+ read_raw_unsafe(ptr, len);
273
+ return;
274
+ }
252
275
  throw std::runtime_error(format("read error: %s", strerror(errno)));
253
276
  }
254
277
  if (ret == 0) {
@@ -266,7 +289,8 @@ struct llama_file::impl {
266
289
  }
267
290
  }
268
291
 
269
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
292
+ void read_aligned_chunk(void * dest, size_t size) {
293
+ size_t offset = tell();
270
294
  off_t aligned_offset = offset & ~(alignment - 1);
271
295
  off_t offset_from_alignment = offset - aligned_offset;
272
296
  size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
@@ -283,13 +307,21 @@ struct llama_file::impl {
283
307
  std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
284
308
 
285
309
  seek(aligned_offset, SEEK_SET);
286
- read_raw(buffer.get(), bytes_to_read);
310
+ read_raw_unsafe(buffer.get(), bytes_to_read);
287
311
 
288
312
  uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
289
313
  memcpy(dest, reinterpret_cast<void *>(actual_data), size);
290
314
  }
291
315
 
292
- uint32_t read_u32() const {
316
+ void read_raw(void * ptr, size_t len) {
317
+ if (has_direct_io()) {
318
+ read_aligned_chunk(ptr, len);
319
+ } else {
320
+ read_raw_unsafe(ptr, len);
321
+ }
322
+ }
323
+
324
+ uint32_t read_u32() {
293
325
  uint32_t ret;
294
326
  read_raw(&ret, sizeof(ret));
295
327
  return ret;
@@ -310,6 +342,10 @@ struct llama_file::impl {
310
342
  write_raw(&val, sizeof(val));
311
343
  }
312
344
 
345
+ bool has_direct_io() const {
346
+ return fd != -1 && alignment > 1;
347
+ }
348
+
313
349
  ~impl() {
314
350
  if (fd != -1) {
315
351
  close(fd);
@@ -318,17 +354,9 @@ struct llama_file::impl {
318
354
  }
319
355
  }
320
356
  int fd = -1;
357
+ std::string fname;
321
358
  #endif
322
359
 
323
- void read_raw_at(void * ptr, size_t len, size_t offset) const {
324
- if (alignment != 1) {
325
- read_aligned_chunk(offset, ptr, len);
326
- } else {
327
- seek(offset, SEEK_SET);
328
- read_raw(ptr, len);
329
- }
330
- }
331
-
332
360
  size_t read_alignment() const {
333
361
  return alignment;
334
362
  }
@@ -347,6 +375,7 @@ size_t llama_file::tell() const { return pimpl->tell(); }
347
375
  size_t llama_file::size() const { return pimpl->size; }
348
376
 
349
377
  size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
378
+ bool llama_file::has_direct_io() const { return pimpl->has_direct_io(); }
350
379
 
351
380
  int llama_file::file_id() const {
352
381
  #ifdef _WIN32
@@ -361,10 +390,14 @@ int llama_file::file_id() const {
361
390
  }
362
391
 
363
392
  void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
364
- void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
365
- void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }
393
+ void llama_file::read_raw(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
394
+ #ifdef _WIN32
395
+ void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw(ptr, len); }
396
+ #else
397
+ void llama_file::read_raw_unsafe(void * ptr, size_t len) { pimpl->read_raw_unsafe(ptr, len); }
398
+ #endif
366
399
 
367
- uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
400
+ uint32_t llama_file::read_u32() { return pimpl->read_u32(); }
368
401
 
369
402
  void llama_file::write_raw(const void * ptr, size_t len) const { pimpl->write_raw(ptr, len); }
370
403
  void llama_file::write_u32(uint32_t val) const { pimpl->write_u32(val); }
@@ -24,15 +24,16 @@ struct llama_file {
24
24
 
25
25
  void seek(size_t offset, int whence) const;
26
26
 
27
- void read_raw(void * ptr, size_t len) const;
28
- void read_raw_at(void * ptr, size_t len, size_t offset) const;
29
- void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
30
- uint32_t read_u32() const;
27
+ void read_raw(void * ptr, size_t len);
28
+ void read_raw_unsafe(void * ptr, size_t len);
29
+ void read_aligned_chunk(void * dest, size_t size);
30
+ uint32_t read_u32();
31
31
 
32
32
  void write_raw(const void * ptr, size_t len) const;
33
33
  void write_u32(uint32_t val) const;
34
34
 
35
35
  size_t read_alignment() const;
36
+ bool has_direct_io() const;
36
37
  private:
37
38
  struct impl;
38
39
  std::unique_ptr<impl> pimpl;
@@ -495,6 +495,7 @@ llama_model_loader::llama_model_loader(
495
495
  const std::string & fname,
496
496
  std::vector<std::string> & splits,
497
497
  bool use_mmap,
498
+ bool use_direct_io,
498
499
  bool check_tensors,
499
500
  bool no_alloc,
500
501
  const llama_model_kv_override * param_overrides_p,
@@ -527,9 +528,17 @@ llama_model_loader::llama_model_loader(
527
528
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
528
529
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));
529
530
 
530
- files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
531
+ files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
531
532
  contexts.emplace_back(ctx);
532
533
 
534
+ use_direct_io = use_direct_io && files.back()->has_direct_io();
535
+
536
+ // Disable mmap in case Direct I/O is enabled and available
537
+ if (use_direct_io && use_mmap) {
538
+ use_mmap = false;
539
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
540
+ }
541
+
533
542
  // Save tensors data offset of the main file.
534
543
  // For subsidiary files, `meta` tensor data offset must not be used,
535
544
  // so we build a unified tensors index for weights.
@@ -595,7 +604,7 @@ llama_model_loader::llama_model_loader(
595
604
  }
596
605
  }
597
606
 
598
- files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
607
+ files.emplace_back(new llama_file(fname_split, "rb", use_direct_io));
599
608
  contexts.emplace_back(ctx);
600
609
 
601
610
  // Save tensors data offset info of the shard.
@@ -739,6 +748,7 @@ llama_model_loader::llama_model_loader(
739
748
  }
740
749
 
741
750
  this->use_mmap = use_mmap;
751
+ this->use_direct_io = use_direct_io;
742
752
  this->check_tensors = check_tensors;
743
753
  this->no_alloc = no_alloc;
744
754
  }
@@ -1100,7 +1110,8 @@ bool llama_model_loader::load_all_data(
1100
1110
  const auto & file = files.at(weight->idx);
1101
1111
 
1102
1112
  if (ggml_backend_buffer_is_host(cur->buffer)) {
1103
- file->read_raw_at(cur->data, n_size, weight->offs);
1113
+ file->seek(weight->offs, SEEK_SET);
1114
+ file->read_raw(cur->data, n_size);
1104
1115
  if (check_tensors) {
1105
1116
  validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
1106
1117
  return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1132,7 +1143,7 @@ bool llama_model_loader::load_all_data(
1132
1143
  ggml_backend_event_synchronize(events[buffer_idx]);
1133
1144
 
1134
1145
  // Read aligned chunk from file
1135
- file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
1146
+ file->read_raw_unsafe(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
1136
1147
 
1137
1148
  // Calculate actual data portion (excluding alignment padding)
1138
1149
  uintptr_t ptr_data = ptr_dest_aligned;
@@ -1162,7 +1173,8 @@ bool llama_model_loader::load_all_data(
1162
1173
  }
1163
1174
  } else {
1164
1175
  read_buf.resize(n_size);
1165
- file->read_raw_at(read_buf.data(), n_size, weight->offs);
1176
+ file->seek(weight->offs, SEEK_SET);
1177
+ file->read_raw(read_buf.data(), n_size);
1166
1178
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
1167
1179
  if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
1168
1180
  throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
@@ -70,6 +70,7 @@ struct llama_model_loader {
70
70
  size_t n_bytes = 0;
71
71
 
72
72
  bool use_mmap = false;
73
+ bool use_direct_io = false;
73
74
  bool check_tensors;
74
75
  bool no_alloc;
75
76
 
@@ -97,6 +98,7 @@ struct llama_model_loader {
97
98
  const std::string & fname,
98
99
  std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
99
100
  bool use_mmap,
101
+ bool use_direct_io,
100
102
  bool check_tensors,
101
103
  bool no_alloc,
102
104
  const llama_model_kv_override * param_overrides_p,
@@ -2440,7 +2440,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2440
2440
 
2441
2441
  const bool use_mmap_buffer = true;
2442
2442
 
2443
- LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
2443
+ LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s, direct_io = %s)\n",
2444
+ __func__, ml.use_mmap ? "true" : "false", ml.use_direct_io ? "true" : "false");
2444
2445
 
2445
2446
  // build a list of buffer types for the CPU and GPU devices
2446
2447
  pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts, params.no_host);
@@ -2451,6 +2452,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2451
2452
  pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
2452
2453
  }
2453
2454
 
2455
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2456
+ if (cpu_dev == nullptr) {
2457
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
2458
+ }
2459
+
2454
2460
  // calculate the split points
2455
2461
  bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
2456
2462
  std::vector<float> splits(n_devices());
@@ -2461,6 +2467,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2461
2467
  size_t total;
2462
2468
  size_t free;
2463
2469
  ggml_backend_dev_memory(dev, &free, &total);
2470
+
2471
+ // devices can return 0 bytes for free and total memory if they do not
2472
+ // have any to report. in this case, we will use the host memory as a fallback
2473
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
2474
+ if (free == 0 && total == 0) {
2475
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
2476
+ }
2464
2477
  splits[i] = free;
2465
2478
  }
2466
2479
  } else {
@@ -2477,10 +2490,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
2477
2490
  splits[i] /= split_sum;
2478
2491
  }
2479
2492
 
2480
- ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
2481
- if (cpu_dev == nullptr) {
2482
- throw std::runtime_error(format("%s: no CPU backend found", __func__));
2483
- }
2484
2493
  const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
2485
2494
  const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
2486
2495
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -7973,6 +7982,7 @@ llama_model_params llama_model_default_params() {
7973
7982
  /*.kv_overrides =*/ nullptr,
7974
7983
  /*.vocab_only =*/ false,
7975
7984
  /*.use_mmap =*/ true,
7985
+ /*.use_direct_io =*/ true,
7976
7986
  /*.use_mlock =*/ false,
7977
7987
  /*.check_tensors =*/ false,
7978
7988
  /*.use_extra_bufts =*/ true,
@@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
596
596
  }
597
597
 
598
598
  std::vector<std::string> splits = {};
599
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
599
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*use_direct_io*/ true, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
600
600
  ml.init_mappings(false); // no prefetching
601
601
 
602
602
  llama_model model(llama_model_default_params());
@@ -2142,7 +2142,7 @@ struct llama_sampler_xtc {
2142
2142
  const uint32_t seed;
2143
2143
  uint32_t seed_cur;
2144
2144
 
2145
- std::mt19937 rng;
2145
+ std::mt19937 rng;
2146
2146
  };
2147
2147
 
2148
2148
  static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
@@ -111,8 +111,20 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
111
111
  }
112
112
  }
113
113
  for (size_t i = 0; i < ret.size(); i++) {
114
- size_t free, total;
114
+ size_t free;
115
+ size_t total;
115
116
  ggml_backend_dev_memory(model->devices[i], &free, &total);
117
+
118
+ // devices can return 0 bytes for free and total memory if they do not
119
+ // have any to report. in this case, we will use the host memory as a fallback
120
+ // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
121
+ if (free == 0 && total == 0) {
122
+ ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
123
+ if (cpu_dev == nullptr) {
124
+ throw std::runtime_error(format("%s: no CPU backend found", __func__));
125
+ }
126
+ ggml_backend_dev_memory(cpu_dev, &free, &total);
127
+ }
116
128
  ret[i].free = free;
117
129
  ret[i].total = total;
118
130
  }
@@ -147,9 +159,8 @@ class llama_params_fit_exception : public std::runtime_error {
147
159
  static void llama_params_fit_impl(
148
160
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
149
161
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
150
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
162
+ size_t * margins_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
151
163
  constexpr int64_t MiB = 1024*1024;
152
- const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
153
164
  typedef std::vector<llama_device_memory_data> dmds_t;
154
165
  const llama_model_params default_mparams = llama_model_default_params();
155
166
 
@@ -168,6 +179,12 @@ static void llama_params_fit_impl(
168
179
  return;
169
180
  }
170
181
 
182
+ std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
183
+ margins.reserve(nd);
184
+ for (size_t id = 0; id < nd; id++) {
185
+ margins.push_back(margins_s[id]);
186
+ }
187
+
171
188
  std::vector<std::string> dev_names;
172
189
  {
173
190
  dev_names.reserve(nd);
@@ -187,9 +204,10 @@ static void llama_params_fit_impl(
187
204
 
188
205
  int64_t sum_free = 0;
189
206
  int64_t sum_projected_free = 0;
190
- int64_t min_projected_free = INT64_MAX;
191
207
  int64_t sum_projected_used = 0;
192
208
  int64_t sum_projected_model = 0;
209
+ std::vector<int64_t> projected_free_per_device;
210
+ projected_free_per_device.reserve(nd);
193
211
 
194
212
  if (nd > 1) {
195
213
  LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
@@ -199,45 +217,63 @@ static void llama_params_fit_impl(
199
217
 
200
218
  const int64_t projected_used = dmd.mb.total();
201
219
  const int64_t projected_free = dmd.free - projected_used;
220
+ projected_free_per_device.push_back(projected_free);
202
221
 
203
222
  sum_free += dmd.free;
204
223
  sum_projected_used += projected_used;
205
224
  sum_projected_free += projected_free;
206
- min_projected_free = std::min(min_projected_free, projected_free);
207
225
  sum_projected_model += dmd.mb.model;
208
226
 
209
227
  if (nd > 1) {
210
- LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
211
- __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
212
- projected_free >= 0 ? "surplus" : "deficit");
228
+ LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
229
+ __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
213
230
  }
214
231
  }
215
232
  assert(sum_free >= 0 && sum_projected_used >= 0);
216
233
  LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
217
234
  __func__, sum_projected_used/MiB, sum_free/MiB);
218
- if (min_projected_free >= margin) {
219
- if (nd == 1) {
235
+ if (nd == 1) {
236
+ if (projected_free_per_device[0] >= margins[0]) {
220
237
  LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
221
- __func__, min_projected_free/MiB, margin/MiB);
238
+ __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
239
+ return;
240
+ }
241
+ } else {
242
+ bool changes_needed = false;
243
+ for (size_t id = 0; id < nd; id++) {
244
+ if (projected_free_per_device[id] < margins[id]) {
245
+ changes_needed = true;
246
+ break;
247
+ }
248
+ }
249
+ if (!changes_needed) {
250
+ LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
222
251
  return;
223
252
  }
224
- LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
225
- __func__, min_projected_free/MiB, margin/MiB);
226
- return;
227
253
  }
228
254
 
229
255
  // step 2: try reducing memory use by reducing the context size
230
256
 
231
257
  {
232
- int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
258
+ int64_t global_surplus = sum_projected_free;
259
+ for (size_t id = 0; id < nd; id++) {
260
+ global_surplus -= margins[id];
261
+ }
233
262
  if (global_surplus < 0) {
234
- LLAMA_LOG_INFO(nd == 1 ?
235
- "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
236
- "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
237
- __func__, margin/MiB, -global_surplus/MiB);
263
+ if (nd == 1) {
264
+ LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
265
+ __func__, margins[0]/MiB, -global_surplus/MiB);
266
+ } else {
267
+ LLAMA_LOG_INFO(
268
+ "%s: cannot meet free memory targets on all devices, need to use %" PRId64 " MiB less in total\n",
269
+ __func__, -global_surplus/MiB);
270
+ }
238
271
  if (cparams->n_ctx == 0) {
239
272
  if (hp_nct > n_ctx_min) {
240
- int64_t sum_used_target = sum_free - nd*margin_s;
273
+ int64_t sum_used_target = sum_free;
274
+ for (size_t id = 0; id < nd; id++) {
275
+ sum_used_target -= margins[id];
276
+ }
241
277
  if (nd > 1) {
242
278
  // for multiple devices we need to be more conservative in terms of how much context we think can fit:
243
279
  // - for dense models only whole layers can be assigned to devices
@@ -448,9 +484,9 @@ static void llama_params_fit_impl(
448
484
  const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
449
485
  path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
450
486
 
451
- for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
452
- global_surplus_cpu_moe += dmd.free;
453
- global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
487
+ for (size_t id = 0; id < nd; id++) {
488
+ global_surplus_cpu_moe += dmds_cpu_moe[id].free;
489
+ global_surplus_cpu_moe -= int64_t(dmds_cpu_moe[id].mb.total()) + margins[id];
454
490
  }
455
491
 
456
492
  if (global_surplus_cpu_moe > 0) {
@@ -469,7 +505,7 @@ static void llama_params_fit_impl(
469
505
  std::vector<int64_t> targets; // maximum acceptable memory use per device
470
506
  targets.reserve(nd);
471
507
  for (size_t id = 0; id < nd; id++) {
472
- targets.push_back(dmds_full[id].free - margin);
508
+ targets.push_back(dmds_full[id].free - margins[id]);
473
509
  LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
474
510
  }
475
511
 
@@ -701,11 +737,11 @@ static void llama_params_fit_impl(
701
737
  enum llama_params_fit_status llama_params_fit(
702
738
  const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
703
739
  float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
704
- size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
740
+ size_t * margins, uint32_t n_ctx_min, enum ggml_log_level log_level) {
705
741
  const int64_t t0_us = llama_time_us();
706
742
  llama_params_fit_status status = LLAMA_PARAMS_FIT_STATUS_SUCCESS;
707
743
  try {
708
- llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
744
+ llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margins, n_ctx_min, log_level);
709
745
  LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
710
746
  } catch (const llama_params_fit_exception & e) {
711
747
  LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
@@ -794,7 +830,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
794
830
  model.t_start_us = tm.t_start_us;
795
831
 
796
832
  try {
797
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
833
+ llama_model_loader ml(fname, splits, params.use_mmap, params.use_direct_io, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
798
834
 
799
835
  ml.print_info();
800
836