@fugood/llama.node 1.4.8 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,9 +13,10 @@
  #ifdef __has_include
  #if __has_include(<unistd.h>)
  #include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/stat.h>
  #if defined(_POSIX_MAPPED_FILES)
  #include <sys/mman.h>
- #include <fcntl.h>
  #endif
  #if defined(_POSIX_MEMLOCK_RANGE)
  #include <sys/resource.h>
@@ -74,7 +75,7 @@ struct llama_file::impl {
  return ret;
  }

- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
  fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -153,13 +154,40 @@ struct llama_file::impl {
  write_raw(&val, sizeof(val));
  }

+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ throw std::runtime_error("DirectIO is not implemented on Windows.");
+ }
+
  ~impl() {
  if (fp) {
  std::fclose(fp);
  }
  }
  #else
- impl(const char * fname, const char * mode) {
+ impl(const char * fname, const char * mode, [[maybe_unused]] const bool use_direct_io = false) {
+ #ifdef __linux__
+ // Try unbuffered I/O for read only
+ if (use_direct_io && std::strcmp(mode, "rb") == 0) {
+ fd = open(fname, O_RDONLY | O_DIRECT);
+
+ if (fd != -1) {
+ struct stat file_stats{};
+ fstat(fd, &file_stats);
+
+ size = file_stats.st_size;
+ alignment = file_stats.st_blksize;
+
+ off_t ret = lseek(fd, 0, SEEK_SET);
+ if (ret == -1) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
+ return;
+ }
+
+ LLAMA_LOG_WARN("Failed to open model %s with error: %s. Falling back to buffered I/O",
+ fname, strerror(errno));
+ }
+ #endif
  fp = ggml_fopen(fname, mode);
  if (fp == NULL) {
  throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
@@ -170,27 +198,30 @@ struct llama_file::impl {
  }

  size_t tell() const {
- // TODO: this ifdef is never true?
- #ifdef _WIN32
- __int64 ret = _ftelli64(fp);
- #else
- long ret = std::ftell(fp);
- #endif
- if (ret == -1) {
- throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ if (fd == -1) {
+ long ret = std::ftell(fp);
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
+ return (size_t) ret;
  }

- return (size_t) ret;
+ off_t pos = lseek(fd, 0, SEEK_CUR);
+ if (pos == -1) {
+ throw std::runtime_error(format("lseek error: %s", strerror(errno)));
+ }
+ return (size_t) pos;
  }

  void seek(size_t offset, int whence) const {
- // TODO: this ifdef is never true?
- #ifdef _WIN32
- int ret = _fseeki64(fp, (__int64) offset, whence);
- #else
- int ret = std::fseek(fp, (long) offset, whence);
- #endif
- if (ret != 0) {
+ off_t ret = 0;
+ if (fd == -1) {
+ ret = std::fseek(fp, (long) offset, whence);
+ } else {
+ ret = lseek(fd, offset, whence);
+ }
+ if (ret == -1) {
  throw std::runtime_error(format("seek error: %s", strerror(errno)));
  }
  }
@@ -200,13 +231,55 @@ struct llama_file::impl {
  return;
  }
  errno = 0;
- std::size_t ret = std::fread(ptr, len, 1, fp);
- if (ferror(fp)) {
- throw std::runtime_error(format("read error: %s", strerror(errno)));
+ if (fd == -1) {
+ std::size_t ret = std::fread(ptr, len, 1, fp);
+ if (ferror(fp)) {
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret != 1) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+ } else {
+ bool successful = false;
+ while (!successful) {
+ off_t ret = read(fd, ptr, len);
+
+ if (ret == -1) {
+ if (errno == EINTR) {
+ continue; // Interrupted by signal, retry
+ }
+ throw std::runtime_error(format("read error: %s", strerror(errno)));
+ }
+ if (ret == 0) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ successful = true;
+ }
  }
- if (ret != 1) {
- throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const {
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ size_t bytes_to_read = (offset_from_alignment + size + alignment - 1) & ~(alignment - 1);
+
+ void * raw_buffer = nullptr;
+ int ret = posix_memalign(&raw_buffer, alignment, bytes_to_read);
+ if (ret != 0) {
+ throw std::runtime_error(format("posix_memalign failed with error %d", ret));
  }
+
+ struct aligned_buffer_deleter {
+ void operator()(void * p) const { free(p); }
+ };
+ std::unique_ptr<void, aligned_buffer_deleter> buffer(raw_buffer);
+
+ seek(aligned_offset, SEEK_SET);
+ read_raw(buffer.get(), bytes_to_read);
+
+ uintptr_t actual_data = reinterpret_cast<uintptr_t>(buffer.get()) + offset_from_alignment;
+ memcpy(dest, reinterpret_cast<void *>(actual_data), size);
  }

  uint32_t read_u32() const {
@@ -231,22 +304,43 @@ struct llama_file::impl {
  }

  ~impl() {
- if (fp) {
+ if (fd != -1) {
+ close(fd);
+ } else {
  std::fclose(fp);
  }
  }
+ int fd = -1;
  #endif

- FILE * fp;
- size_t size;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const {
+ if (alignment != 1) {
+ read_aligned_chunk(offset, ptr, len);
+ } else {
+ seek(offset, SEEK_SET);
+ read_raw(ptr, len);
+ }
+ }
+
+ size_t read_alignment() const {
+ return alignment;
+ }
+
+ size_t alignment = 1;
+
+ FILE * fp{};
+ size_t size{};
  };

- llama_file::llama_file(const char * fname, const char * mode) : pimpl(std::make_unique<impl>(fname, mode)) {}
+ llama_file::llama_file(const char * fname, const char * mode, const bool use_direct_io) :
+ pimpl(std::make_unique<impl>(fname, mode, use_direct_io)) {}
  llama_file::~llama_file() = default;

  size_t llama_file::tell() const { return pimpl->tell(); }
  size_t llama_file::size() const { return pimpl->size; }

+ size_t llama_file::read_alignment() const { return pimpl->read_alignment(); }
+
  int llama_file::file_id() const {
  #ifdef _WIN32
  return _fileno(pimpl->fp);
@@ -261,6 +355,7 @@ int llama_file::file_id() const {

  void llama_file::seek(size_t offset, int whence) const { pimpl->seek(offset, whence); }
  void llama_file::read_raw(void * ptr, size_t len) const { pimpl->read_raw(ptr, len); }
+ void llama_file::read_raw_at(void * ptr, size_t len, size_t offset) const { pimpl->read_raw_at(ptr, len, offset); }

  uint32_t llama_file::read_u32() const { return pimpl->read_u32(); }
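The read_aligned_chunk() added above rounds the requested offset down to the device block boundary and the total read length up to a multiple of it, then copies only the requested bytes out of the over-sized aligned buffer. A minimal standalone sketch of that arithmetic, illustrative only and not part of the package (it assumes a power-of-two alignment such as the st_blksize value used above):

    #include <cstddef>
    #include <cstdio>

    // Mirror the offset/length bookkeeping of read_aligned_chunk above.
    static void aligned_window(size_t offset, size_t size, size_t alignment,
                               size_t & aligned_offset, size_t & skip, size_t & bytes_to_read) {
        aligned_offset = offset & ~(alignment - 1);                        // round offset down to a block boundary
        skip           = offset - aligned_offset;                          // padding that precedes the requested data
        bytes_to_read  = (skip + size + alignment - 1) & ~(alignment - 1); // round the covered range up to whole blocks
    }

    int main() {
        size_t aligned_offset = 0, skip = 0, bytes_to_read = 0;
        aligned_window(/*offset=*/5000, /*size=*/3000, /*alignment=*/4096,
                       aligned_offset, skip, bytes_to_read);
        // Prints 4096 904 4096: the 3000 requested bytes start 904 bytes into a single 4096-byte block.
        std::printf("%zu %zu %zu\n", aligned_offset, skip, bytes_to_read);
        return 0;
    }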
@@ -3,6 +3,7 @@
  #include <cstdint>
  #include <memory>
  #include <vector>
+ #include <cstdio>

  struct llama_file;
  struct llama_mmap;
@@ -13,7 +14,7 @@ using llama_mmaps = std::vector<std::unique_ptr<llama_mmap>>;
  using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;

  struct llama_file {
- llama_file(const char * fname, const char * mode);
+ llama_file(const char * fname, const char * mode, bool use_direct_io = false);
  ~llama_file();

  size_t tell() const;
@@ -24,11 +25,14 @@ struct llama_file {
  void seek(size_t offset, int whence) const;

  void read_raw(void * ptr, size_t len) const;
+ void read_raw_at(void * ptr, size_t len, size_t offset) const;
+ void read_aligned_chunk(size_t offset, void * dest, size_t size) const;
  uint32_t read_u32() const;

  void write_raw(const void * ptr, size_t len) const;
  void write_u32(uint32_t val) const;

+ size_t read_alignment() const;
  private:
  struct impl;
  std::unique_ptr<impl> pimpl;
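Taken together, these header changes extend llama_file with an opt-in direct I/O mode and offset-based reads. A hedged usage sketch, not taken from the package (the include path and file name are assumptions, and the constructor silently falls back to buffered reads where O_DIRECT is unavailable, as the implementation diff above shows):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    #include "llama-mmap.h" // header diffed above; exact include path assumed

    void example_read(const char * fname) {
        // Request unbuffered reads for a read-only file.
        llama_file file(fname, "rb", /*use_direct_io=*/true);

        // Reports the device block size when direct I/O is active, 1 on the buffered path.
        const size_t alignment = file.read_alignment();

        // read_raw_at() combines seek + read; with alignment != 1 it goes
        // through the aligned-chunk path internally.
        std::vector<char> buf(std::min<size_t>(file.size(), 4096));
        file.read_raw_at(buf.data(), buf.size(), /*offset=*/0);

        (void) alignment;
    }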
@@ -504,7 +504,7 @@ llama_model_loader::llama_model_loader(
  get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
  llm_kv = LLM_KV(llm_arch_from_string(arch_name));

- files.emplace_back(new llama_file(fname.c_str(), "rb"));
+ files.emplace_back(new llama_file(fname.c_str(), "rb", !use_mmap));
  contexts.emplace_back(ctx);

  // Save tensors data offset of the main file.
@@ -572,7 +572,7 @@ llama_model_loader::llama_model_loader(
  }
  }

- files.emplace_back(new llama_file(fname_split, "rb"));
+ files.emplace_back(new llama_file(fname_split, "rb", !use_mmap));
  contexts.emplace_back(ctx);

  // Save tensors data offset info of the shard.
@@ -935,7 +935,15 @@ bool llama_model_loader::load_all_data(
  // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
  // NVMe raid configurations might require more / larger buffers.
  constexpr size_t n_buffers = 4;
- constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ size_t alignment = 1;
+ for (const auto & file : files) {
+ alignment = std::max(file->read_alignment(), alignment);
+ }
+
+ // Buffer size: balance between memory usage and I/O efficiency
+ // 64MB works well for NVMe drives
+ const size_t buffer_size = alignment != 1 ? 64 * 1024 * 1024 + 2 * alignment : 1 * 1024 * 1024;

  std::vector<ggml_backend_buffer_t> host_buffers;
  std::vector<ggml_backend_event_t> events;
@@ -985,6 +993,7 @@ bool llama_model_loader::load_all_data(
  // If the backend is supported, create pinned memory buffers and events for synchronisation.
  for (size_t idx = 0; idx < n_buffers; ++idx) {
  auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size);
+
  if (!buf) {
  LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func,
  ggml_backend_dev_name(dev));
@@ -1066,9 +1075,9 @@ bool llama_model_loader::load_all_data(
  }
  } else {
  const auto & file = files.at(weight->idx);
+
  if (ggml_backend_buffer_is_host(cur->buffer)) {
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(cur->data, n_size);
+ file->read_raw_at(cur->data, n_size, weight->offs);
  if (check_tensors) {
  validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
  return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
@@ -1077,26 +1086,60 @@ bool llama_model_loader::load_all_data(
  } else {
  // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
  if (upload_backend) {
- file->seek(weight->offs, SEEK_SET);
+ auto offset = (off_t) weight->offs;
+ alignment = file->read_alignment();
+ off_t aligned_offset = offset & ~(alignment - 1);
+ off_t offset_from_alignment = offset - aligned_offset;
+ file->seek(aligned_offset, SEEK_SET);
+
+ // Calculate aligned read boundaries
+ size_t read_start = aligned_offset;
+ size_t read_end = (offset + n_size + alignment - 1) & ~(alignment - 1);

  size_t bytes_read = 0;
+ size_t data_read = 0; // Actual tensor data copied (excluding padding)
+
+ while (bytes_read < read_end - read_start) {
+ size_t read_size = std::min<size_t>(buffer_size, read_end - read_start - bytes_read);

- while (bytes_read < n_size) {
- size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+ // Align the destination pointer within the pinned buffer
+ uintptr_t ptr_dest_aligned = (reinterpret_cast<uintptr_t>(host_ptrs[buffer_idx]) + alignment - 1) & ~(alignment - 1);

+ // Wait for previous upload to complete before reusing buffer
  ggml_backend_event_synchronize(events[buffer_idx]);
- file->read_raw(host_ptrs[buffer_idx], read_iteration);
- ggml_backend_tensor_set_async(upload_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+
+ // Read aligned chunk from file
+ file->read_raw(reinterpret_cast<void *>(ptr_dest_aligned), read_size);
+
+ // Calculate actual data portion (excluding alignment padding)
+ uintptr_t ptr_data = ptr_dest_aligned;
+ size_t data_to_copy = read_size;
+
+ // Skip alignment padding at start of first chunk
+ if (bytes_read == 0) {
+ ptr_data += offset_from_alignment;
+ data_to_copy -= offset_from_alignment;
+ }
+
+ // Trim alignment padding at end of last chunk
+ if (aligned_offset + bytes_read + read_size > offset + n_size) {
+ data_to_copy -= (read_end - (offset + n_size));
+ }
+
+ // Async upload actual data to GPU
+ ggml_backend_tensor_set_async(upload_backend, cur,
+ reinterpret_cast<void *>(ptr_data), data_read, data_to_copy);
  ggml_backend_event_record(events[buffer_idx], upload_backend);

- bytes_read += read_iteration;
+ data_read += data_to_copy;
+ bytes_read += read_size;
+
  ++buffer_idx;
  buffer_idx %= n_buffers;
  }
  } else {
  read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
+ file->read_raw_at(read_buf.data(), n_size, weight->offs);
  ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
  if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
  throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
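As a sanity check on the padding bookkeeping in the chunked upload loop above, here is a small self-contained walkthrough with made-up numbers (a 10000-byte tensor at file offset 5000, 4096-byte alignment, one chunk large enough to cover the whole aligned window); it is illustrative only and shows that exactly n_size tensor bytes are handed to the async upload:

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t alignment = 4096;
        const size_t offset    = 5000;   // weight->offs in the loop above
        const size_t n_size    = 10000;  // tensor size in bytes

        const size_t aligned_offset        = offset & ~(alignment - 1);                            // 4096
        const size_t offset_from_alignment = offset - aligned_offset;                              // 904
        const size_t read_end              = (offset + n_size + alignment - 1) & ~(alignment - 1); // 16384

        // Single chunk covering the whole aligned window (bytes_read == 0).
        const size_t read_size    = read_end - aligned_offset;                                     // 12288
        size_t       data_to_copy = read_size;

        data_to_copy -= offset_from_alignment;                 // drop padding before the tensor (first chunk)
        if (aligned_offset + read_size > offset + n_size) {
            data_to_copy -= read_end - (offset + n_size);      // drop padding after the tensor (last chunk)
        }

        assert(data_to_copy == n_size);                        // only the tensor bytes get uploaded
        return 0;
    }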
@@ -2378,10 +2378,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (cpu_dev == nullptr) {
  throw std::runtime_error(format("%s: no CPU backend found", __func__));
  }
- const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
- const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
+ const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0);
+ const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1);
  auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
- const bool is_swa = il < (int) hparams.n_layer && hparams.is_swa(il);
+ const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il);
  if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
  LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa);
  return {cpu_dev, &pimpl->cpu_buft_list};
@@ -6693,10 +6693,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
  if (llama_supports_gpu_offload()) {
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

- LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
- if (n_gpu_layers > (int) hparams.n_layer) {
+ int n_repeating = n_gpu;
+ if (n_repeating > 0) {
  LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+ n_repeating--;
  }
+ LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating);

  const int max_backend_supported_layers = hparams.n_layer + 1;
  const int max_offloadable_layers = hparams.n_layer + 1;
@@ -362,23 +362,39 @@ const char * llama_sampler_name(const struct llama_sampler * smpl) {
  }

  void llama_sampler_accept(struct llama_sampler * smpl, llama_token token) {
+ if (!smpl) {
+ return;
+ }
+
  if (smpl->iface->accept) {
  smpl->iface->accept(smpl, token);
  }
  }

  void llama_sampler_apply(struct llama_sampler * smpl, struct llama_token_data_array * cur_p) {
+ if (!smpl) {
+ return;
+ }
+
  GGML_ASSERT(smpl->iface->apply);
  smpl->iface->apply(smpl, cur_p);
  }

  void llama_sampler_reset(struct llama_sampler * smpl) {
+ if (!smpl) {
+ return;
+ }
+
  if (smpl->iface->reset) {
  smpl->iface->reset(smpl);
  }
  }

  struct llama_sampler * llama_sampler_clone(const struct llama_sampler * smpl) {
+ if (!smpl) {
+ return nullptr;
+ }
+
  if (smpl->iface->clone) {
  return smpl->iface->clone(smpl);
  }
@@ -292,10 +292,6 @@ static void llama_params_fit_impl(
  if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
  throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
  }
- if (hp_ngl < 2*nd) {
- throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
- + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
- }
  }
  if (!tensor_buft_overrides) {
  throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
@@ -362,8 +358,7 @@ static void llama_params_fit_impl(
  auto set_ngl_tensor_split_tbo = [&](
  const std::vector<ngl_t> & ngl_per_device,
  const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
- llama_model_params & mparams,
- const bool add_nonrepeating) {
+ llama_model_params & mparams) {
  mparams.n_gpu_layers = 0;
  for (size_t id = 0; id < nd; id++) {
  mparams.n_gpu_layers += ngl_per_device[id].n_layer;
@@ -371,13 +366,9 @@ static void llama_params_fit_impl(
  tensor_split[id] = ngl_per_device[id].n_layer;
  }
  }
- assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
- uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+ assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl + 1);
+ uint32_t il0 = hp_ngl + 1 - mparams.n_gpu_layers; // start index for tensor buft overrides

- if (add_nonrepeating) {
- mparams.n_gpu_layers += 1;
- tensor_split[nd - 1] += 1;
- }
  mparams.tensor_split = tensor_split;

  size_t itbo = 0;
@@ -408,10 +399,9 @@ static void llama_params_fit_impl(
  auto get_memory_for_layers = [&](
  const char * func_name,
  const std::vector<ngl_t> & ngl_per_device,
- const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
- const bool add_nonrepeating) -> std::vector<int64_t> {
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts) -> std::vector<int64_t> {
  llama_model_params mparams_copy = *mparams;
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy);

  const dmds_t dmd_nl = llama_get_device_memory_data(
  path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
@@ -469,9 +459,6 @@ static void llama_params_fit_impl(
  LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
  }

- // whether for the optimal memory use we expect to load at least some MoE tensors:
- const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
-
  std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
  overflow_bufts.reserve(nd);
  for (size_t id = 0; id < nd - 1; ++id) {
@@ -480,7 +467,7 @@ static void llama_params_fit_impl(
  overflow_bufts.push_back(ggml_backend_cpu_buffer_type());

  std::vector<ngl_t> ngl_per_device(nd);
- std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts);
  if (hp_nex > 0) {
  for (size_t id = 0; id < nd; id++) {
  ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
@@ -493,13 +480,14 @@ static void llama_params_fit_impl(
  // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
  // - check memory use of our guess, replace either the low or high bound
  // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+ // - the last device has the output layer, which cannot be a partial layer
  if (hp_nex == 0) {
  LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
  } else {
  LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
  }
  for (int id = nd - 1; id >= 0; id--) {
- uint32_t n_unassigned = hp_ngl;
+ uint32_t n_unassigned = hp_ngl + 1;
  for (size_t jd = id + 1; jd < nd; ++jd) {
  assert(n_unassigned >= ngl_per_device[jd].n_layer);
  n_unassigned -= ngl_per_device[jd].n_layer;
@@ -508,10 +496,10 @@ static void llama_params_fit_impl(
  std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
  ngl_per_device_high[id].n_layer = n_unassigned;
  if (hp_nex > 0) {
- ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+ ngl_per_device_high[id].n_part = size_t(id) < nd - 1 ? ngl_per_device_high[id].n_layer : ngl_per_device_high[id].n_layer - 1;
  }
  if (ngl_per_device_high[id].n_layer > 0) {
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);
  if (mem_high[id] > targets[id]) {
  assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
  uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
@@ -526,7 +514,7 @@ static void llama_params_fit_impl(
  if (hp_nex) {
  ngl_per_device_test[id].n_part += step_size;
  }
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

  if (mem_test[id] <= targets[id]) {
  ngl_per_device = ngl_per_device_test;
@@ -542,6 +530,7 @@ static void llama_params_fit_impl(
  } else {
  assert(ngl_per_device_high[id].n_layer == n_unassigned);
  ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
  }
  }
@@ -552,7 +541,7 @@ static void llama_params_fit_impl(
  __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
  }
  if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
  return;
  }

@@ -575,13 +564,13 @@ static void llama_params_fit_impl(
  for (size_t id = 0; id <= id_dense_start; id++) {
  std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
  for (size_t jd = id_dense_start; jd < nd; jd++) {
- const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+ const uint32_t n_layer_move = jd < nd - 1 ? ngl_per_device_high[jd].n_layer : ngl_per_device_high[jd].n_layer - 1;
  ngl_per_device_high[id].n_layer += n_layer_move;
  ngl_per_device_high[jd].n_layer -= n_layer_move;
  ngl_per_device_high[jd].n_part = 0;
  }
  size_t id_dense_start_high = nd - 1;
- std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts);

  if (mem_high[id] > targets[id]) {
  assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
@@ -609,7 +598,7 @@ static void llama_params_fit_impl(
  break;
  }
  }
- const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);

  if (mem_test[id] <= targets[id]) {
  ngl_per_device = ngl_per_device_test;
@@ -629,13 +618,14 @@ static void llama_params_fit_impl(
  }
  } else {
  ngl_per_device = ngl_per_device_high;
+ mem = mem_high;
  id_dense_start = id_dense_start_high;
  LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
  __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
  }

  // try to fit at least part of one more layer
- if (ngl_per_device[id_dense_start].n_layer > 0) {
+ if (ngl_per_device[id_dense_start].n_layer > (id < nd - 1 ? 0 : 1)) {
  std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
  size_t id_dense_start_test = id_dense_start;
  ngl_per_device_test[id_dense_start_test].n_layer--;
@@ -647,7 +637,7 @@ static void llama_params_fit_impl(
  }
  }
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
- std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
  if (mem_test[id] < targets[id]) {
  ngl_per_device = ngl_per_device_test;
@@ -657,7 +647,7 @@ static void llama_params_fit_impl(

  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
  if (mem_test[id] < targets[id]) {
  ngl_per_device = ngl_per_device_test;
  mem = mem_test;
@@ -668,7 +658,7 @@ static void llama_params_fit_impl(
  } else {
  ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
  LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
- mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts);
  if (mem_test[id] < targets[id]) {
  ngl_per_device = ngl_per_device_test;
  mem = mem_test;
@@ -685,7 +675,7 @@ static void llama_params_fit_impl(
  __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
  }

- set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams);
  }

  bool llama_params_fit(