@fugood/llama.node 1.4.6 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +25 -26
  4. package/src/LlamaContext.cpp +2 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +364 -193
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-parser-xml-toolcall.cpp +36 -18
  9. package/src/llama.cpp/common/chat-parser-xml-toolcall.h +1 -1
  10. package/src/llama.cpp/common/chat-parser.cpp +3 -2
  11. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  12. package/src/llama.cpp/common/chat.cpp +272 -0
  13. package/src/llama.cpp/common/common.cpp +130 -67
  14. package/src/llama.cpp/common/common.h +40 -16
  15. package/src/llama.cpp/common/console.cpp +680 -47
  16. package/src/llama.cpp/common/console.h +30 -8
  17. package/src/llama.cpp/common/download.cpp +69 -25
  18. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  19. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  20. package/src/llama.cpp/common/log.cpp +5 -0
  21. package/src/llama.cpp/common/log.h +1 -0
  22. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  23. package/src/llama.cpp/common/preset.cpp +206 -0
  24. package/src/llama.cpp/common/preset.h +32 -0
  25. package/src/llama.cpp/common/sampling.cpp +91 -92
  26. package/src/llama.cpp/common/sampling.h +11 -6
  27. package/src/llama.cpp/common/speculative.cpp +1 -1
  28. package/src/llama.cpp/ggml/CMakeLists.txt +5 -0
  29. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  30. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  31. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  32. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  33. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  34. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +3 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +69 -39
  37. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/CMakeLists.txt +2 -1
  41. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  42. package/src/llama.cpp/src/llama-arch.h +9 -2
  43. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  44. package/src/llama.cpp/src/llama-batch.h +4 -2
  45. package/src/llama.cpp/src/llama-context.cpp +99 -29
  46. package/src/llama.cpp/src/llama-context.h +9 -3
  47. package/src/llama.cpp/src/llama-grammar.cpp +233 -33
  48. package/src/llama.cpp/src/llama-grammar.h +20 -1
  49. package/src/llama.cpp/src/llama-graph.cpp +85 -17
  50. package/src/llama.cpp/src/llama-graph.h +17 -4
  51. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  52. package/src/llama.cpp/src/llama-hparams.h +5 -1
  53. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  55. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  56. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  57. package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
  58. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  59. package/src/llama.cpp/src/llama-model.cpp +123 -52
  60. package/src/llama.cpp/src/llama-model.h +1 -0
  61. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +675 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/{gemma3-iswa.cpp → gemma3.cpp} +30 -5
  66. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  67. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  68. package/src/llama.cpp/src/models/models.h +8 -7
  69. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  70. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  71. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
@@ -1,6 +1,9 @@
+ #include "llama.h"
+
  #include "llama-impl.h"

  #include "llama-chat.h"
+ #include "llama-context.h"
  #include "llama-mmap.h"
  #include "llama-vocab.h"
  #include "llama-model-loader.h"
@@ -11,11 +14,14 @@
  #include "ggml-backend.h"

  #include <algorithm>
+ #include <cassert>
+ #include <cinttypes>
  #include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #include <cstring>
  #include <ctime>
+ #include <stdexcept>

  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
@@ -37,6 +43,669 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty
  GGML_ABORT("fatal error");
  }

+ struct llama_device_memory_data {
+ int64_t total;
+ int64_t free;
+ llama_memory_breakdown_data mb;
+ };
+
+ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
+ const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams,
+ std::vector<ggml_backend_dev_t> & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert,
+ const ggml_log_level log_level) {
+ struct user_data_t {
+ struct {
+ ggml_log_callback callback;
+ void * user_data;
+ } original_logger;
+ ggml_log_level min_level; // prints below this log level go to debug log
+ };
+ user_data_t ud;
+ llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data);
+ ud.min_level = log_level;
+
+ llama_log_set([](ggml_log_level level, const char * text, void * user_data) {
+ const user_data_t * ud = (const user_data_t *) user_data;
+ const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG;
+ ud->original_logger.callback(level_eff, text, ud->original_logger.user_data);
+ }, &ud);
+
+ llama_model_params mparams_copy = *mparams;
+ mparams_copy.no_alloc = true;
+ mparams_copy.use_mmap = false;
+ mparams_copy.use_mlock = false;
+
+ llama_model * model = llama_model_load_from_file(path_model, mparams_copy);
+ if (model == nullptr) {
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ throw std::runtime_error("failed to load model");
+ }
+
+ llama_context * ctx = llama_init_from_model(model, *cparams);
+ if (ctx == nullptr) {
+ llama_model_free(model);
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ throw std::runtime_error("failed to create llama_context from model");
+ }
+
+ std::vector<llama_device_memory_data> ret(model->devices.size());
+
+ std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
+
+ for (const auto & [buft, mb] : memory_breakdown) {
+ if (ggml_backend_buft_is_host(buft)) {
+ continue;
+ }
+
+ ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+ if (!dev) {
+ continue;
+ }
+ for (size_t i = 0; i < ret.size(); i++) {
+ if (model->devices[i] == dev) {
+ ret[i].mb.model += mb.model;
+ ret[i].mb.context += mb.context;
+ ret[i].mb.compute += mb.compute;
+ break;
+ }
+ }
+ }
+ for (size_t i = 0; i < ret.size(); i++) {
+ size_t free, total;
+ ggml_backend_dev_memory(model->devices[i], &free, &total);
+ ret[i].free = free;
+ ret[i].total = total;
+ }
+
+ devs = model->devices;
+ hp_ngl = model->hparams.n_layer;
+ hp_n_ctx_train = model->hparams.n_ctx_train;
+ hp_n_expert = model->hparams.n_expert;
+
+ llama_memory_breakdown_print(ctx); // goes to debug log
+
+ llama_free(ctx);
+ llama_model_free(model);
+ llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
+ return ret;
+ }
+
+ // enum to identify part of a layer for distributing its tensors:
+ enum layer_fraction_t {
+ LAYER_FRACTION_NONE = 0, // nothing
+ LAYER_FRACTION_ATTN = 1, // attention
+ LAYER_FRACTION_UP = 2, // attention + up
+ LAYER_FRACTION_GATE = 3, // attention + up + gate
+ LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights
+ };
+ // this enum is only used in llama_params_fit_impl but needs to be defined outside of it to fix a Windows compilation issue
+
+ static void llama_params_fit_impl(
+ const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+ float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+ size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+ constexpr int64_t MiB = 1024*1024;
+ const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
+ typedef std::vector<llama_device_memory_data> dmds_t;
+ const llama_model_params default_mparams = llama_model_default_params();
+
+ std::vector<ggml_backend_dev_t> devs;
+ uint32_t hp_ngl = 0; // hparams.n_gpu_layers
+ uint32_t hp_nct = 0; // hparams.n_ctx_train
+ uint32_t hp_nex = 0; // hparams.n_expert
+
+ // step 1: get data for default parameters and check whether any changes are necessary in the first place
+
+ LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
+ const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+ const size_t nd = devs.size(); // number of devices
+ if (nd == 0) {
+ LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
+ return;
+ }
+
+ std::vector<std::string> dev_names;
+ {
+ dev_names.reserve(nd);
+ size_t max_length = 0;
+ for (ggml_backend_dev_t dev : devs) {
+ std::string name = ggml_backend_dev_name(dev);
+ name += " (";
+ name += ggml_backend_dev_description(dev);
+ name += ")";
+ dev_names.push_back(name);
+ max_length = std::max(max_length, name.length());
+ }
+ for (std::string & dn : dev_names) {
+ dn.insert(dn.end(), max_length - dn.length(), ' ');
+ }
+ }
+
+ int64_t sum_total = 0;
+ int64_t sum_projected_free = 0;
+ int64_t min_projected_free = INT64_MAX;
+ int64_t sum_projected_used = 0;
+ int64_t sum_projected_model = 0;
+ int64_t sum_projected_ctx = 0;
+
+ if (nd > 1) {
+ LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+ }
+ for (size_t id = 0; id < nd; id++) {
+ const llama_device_memory_data & dmd = dmds_full[id];
+
+ const int64_t projected_used = dmd.mb.total();
+ const int64_t projected_free = dmd.free - projected_used;
+
+ sum_total += dmd.total;
+ sum_projected_used += projected_used;
+ sum_projected_free += projected_free;
+ min_projected_free = std::min(min_projected_free, projected_free);
+ sum_projected_model += dmd.mb.model;
+ sum_projected_ctx += dmd.mb.context;
+
+ if (nd > 1) {
+ LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n",
+ __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB,
+ projected_free >= 0 ? "surplus" : "deficit");
+ }
+ }
+ assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+ assert(sum_projected_used >= sum_projected_ctx);
+ LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+ __func__, sum_projected_used/MiB, sum_total/MiB);
+ if (min_projected_free >= margin) {
+ if (nd == 1) {
+ LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+ __func__, min_projected_free/MiB, margin/MiB);
+ return;
+ }
+ LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n",
+ __func__, min_projected_free/MiB, margin/MiB);
+ return;
+ }
+
+ // step 2: try reducing memory use by reducing the context size
+
+ {
+ int64_t global_surplus = sum_projected_free - int64_t(nd)*margin;
+ if (global_surplus < 0) {
+ LLAMA_LOG_INFO(nd == 1 ?
+ "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" :
+ "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n",
+ __func__, margin/MiB, -global_surplus/MiB);
+ if (cparams->n_ctx == 0) {
+ if (hp_nct > n_ctx_min) {
+ const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct;
+
+ int64_t memory_reduction = -global_surplus;
+ if (nd > 1) {
+ // for multiple devices we need to be more conservative in terms of how much context we think can fit:
+ // - for dense models only whole layers can be assigned to devices
+ // - for MoE models only whole tensors can be assigned to devices, which we estimate to be <= 1/3 of a layer
+ // - on average we expect a waste of 0.5 layers/tensors per device
+ // - use slightly more than the expected average for nd devices to be safe
+ const int64_t model_per_layer = sum_projected_model / std::min(uint32_t(mparams->n_gpu_layers), hp_ngl);
+ memory_reduction += (nd + 1) * model_per_layer / (hp_nex == 0 ? 2 : 6);
+ }
+
+ uint32_t ctx_reduction = std::min(uint32_t((memory_reduction + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min);
+ cparams->n_ctx = hp_nct - ctx_reduction;
+ cparams->n_ctx = std::max(cparams->n_ctx - cparams->n_ctx % 256, n_ctx_min); // round down context for CUDA backend
+
+ ctx_reduction = hp_nct - cparams->n_ctx;
+ memory_reduction = ctx_reduction * bytes_per_ctx;
+ global_surplus += memory_reduction;
+ LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
+ __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
+ if (global_surplus >= 0) {
+ if (nd == 1) {
+ LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
+ return;
+ }
+ LLAMA_LOG_INFO("%s: entire model should be fit across devices by reducing context\n", __func__);
+ }
+ } else {
+ LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+ __func__, hp_nct, n_ctx_min);
+ }
+ } else {
+ LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
+ }
+ }
+ }
+
+ if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
+ throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
+ }
+ if (nd > 1) {
+ if (!tensor_split) {
+ throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort");
+ }
+ if (mparams->tensor_split) {
+ for (size_t id = 0; id < nd; id++) {
+ if (mparams->tensor_split[id] != 0.0f) {
+ throw std::runtime_error("model_params::tensor_split already set by user, abort");
+ }
+ }
+ }
+ if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) {
+ throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort");
+ }
+ if (hp_ngl < 2*nd) {
+ throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least "
+ + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort");
+ }
+ }
+ if (!tensor_buft_overrides) {
+ throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort");
+ }
+ if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) {
+ throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort");
+ }
+
+ // step 3: iteratively fill the back to front with "dense" layers
+ // - for a dense model simply fill full layers, giving each device a contiguous slice of the model
+ // - for a MoE model, same as dense model but with all MoE tensors in system memory
+
+ // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction:
+ auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * {
+ constexpr size_t n_strings = 1000;
+ if (il >= n_strings) {
+ throw std::runtime_error("at most " + std::to_string(n_strings) + " model layers are supported");
+ }
+ switch (lf) {
+ case LAYER_FRACTION_ATTN: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|gate|down).*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_UP: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(gate|down).*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_GATE: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_down.*";
+ }
+ return patterns[il].c_str();
+ }
+ case LAYER_FRACTION_MOE: {
+ static std::array<std::string, n_strings> patterns;
+ if (patterns[il].empty()) {
+ patterns[il] = "blk\\." + std::to_string(il) + "\\.ffn_(up|down|gate)_(ch|)exps";
+ }
+ return patterns[il].c_str();
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
+ };
+
+ struct ngl_t {
+ uint32_t n_layer = 0; // number of total layers
+ uint32_t n_part = 0; // number of partial layers, <= n_layer
+
+ // for the first partial layer varying parts can overflow, all further layers use LAYER_FRACTION_MOE:
+ layer_fraction_t overflow_type = LAYER_FRACTION_MOE;
+ };
+
+ const size_t ntbo = llama_max_tensor_buft_overrides();
+
+ // utility function to set n_gpu_layers and tensor_split
+ auto set_ngl_tensor_split_tbo = [&](
+ const std::vector<ngl_t> & ngl_per_device,
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+ llama_model_params & mparams,
+ const bool add_nonrepeating) {
+ mparams.n_gpu_layers = 0;
+ for (size_t id = 0; id < nd; id++) {
+ mparams.n_gpu_layers += ngl_per_device[id].n_layer;
+ if (nd > 1) {
+ tensor_split[id] = ngl_per_device[id].n_layer;
+ }
+ }
+ assert(uint32_t(mparams.n_gpu_layers) <= hp_ngl);
+ uint32_t il0 = hp_ngl - mparams.n_gpu_layers; // start index for tensor buft overrides
+
+ if (add_nonrepeating) {
+ mparams.n_gpu_layers += 1;
+ tensor_split[nd - 1] += 1;
+ }
+ mparams.tensor_split = tensor_split;
+
+ size_t itbo = 0;
+ for (size_t id = 0; id < nd; id++) {
+ il0 += ngl_per_device[id].n_layer - ngl_per_device[id].n_part;
+ for (uint32_t il = il0; il < il0 + ngl_per_device[id].n_part; il++) {
+ if (itbo + 1 >= ntbo) {
+ tensor_buft_overrides[itbo].pattern = nullptr;
+ tensor_buft_overrides[itbo].buft = nullptr;
+ itbo++;
+ mparams.tensor_buft_overrides = tensor_buft_overrides;
+ throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == "
+ + std::to_string(ntbo) + " is insufficient for model\n");
+ }
+ tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, il == il0 ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE);
+ tensor_buft_overrides[itbo].buft = overflow_bufts[id];
+ itbo++;
+ }
+ il0 += ngl_per_device[id].n_part;
+ }
+ tensor_buft_overrides[itbo].pattern = nullptr;
+ tensor_buft_overrides[itbo].buft = nullptr;
+ itbo++;
+ mparams.tensor_buft_overrides = tensor_buft_overrides;
+ };
+
+ // utility function that returns the memory use per device for given numbers of layers per device
+ auto get_memory_for_layers = [&](
+ const char * func_name,
+ const std::vector<ngl_t> & ngl_per_device,
+ const std::vector<ggml_backend_buffer_type_t> & overflow_bufts,
+ const bool add_nonrepeating) -> std::vector<int64_t> {
+ llama_model_params mparams_copy = *mparams;
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy, add_nonrepeating);
+
+ const dmds_t dmd_nl = llama_get_device_memory_data(
+ path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+ LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name);
+ for (size_t id = 0; id < nd; id++) {
+ const ngl_t & n = ngl_per_device[id];
+ LLAMA_LOG_DEBUG(
+ "%s: id=%zu, n_layer=%2" PRIu32 ", n_part=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n",
+ func_name, id, n.n_layer, n.n_part, int(n.overflow_type), dmd_nl[id].mb.total()/MiB);
+ }
+
+ std::vector<int64_t> ret;
+ ret.reserve(nd);
+ for (const llama_device_memory_data & dmd : dmd_nl) {
+ ret.push_back(dmd.mb.total());
+ }
+ return ret;
+ };
+
+ int64_t global_surplus_cpu_moe = 0;
+ if (hp_nex > 0) {
+ const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors
+ ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type();
+ tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft};
+ tensor_buft_overrides[1] = {nullptr, nullptr};
+ mparams->tensor_buft_overrides = tensor_buft_overrides;
+
+ LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__);
+ const dmds_t dmds_cpu_moe = llama_get_device_memory_data(
+ path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
+
+ for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
+ global_surplus_cpu_moe += dmd.free;
+ global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin;
+ }
+
+ if (global_surplus_cpu_moe > 0) {
+ LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n",
+ __func__, global_surplus_cpu_moe/MiB);
+ } else {
+ LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n",
+ __func__, -global_surplus_cpu_moe/MiB);
+ }
+
+ // reset
+ tensor_buft_overrides[0] = {nullptr, nullptr};
+ mparams->tensor_buft_overrides = tensor_buft_overrides;
+ }
+
+ std::vector<int64_t> targets; // maximum acceptable memory use per device
+ targets.reserve(nd);
+ for (size_t id = 0; id < nd; id++) {
+ targets.push_back(dmds_full[id].free - margin);
+ LLAMA_LOG_DEBUG("%s: id=%zu, target=%" PRId64 " MiB\n", __func__, id, targets[id]/MiB);
+ }
+
+ // whether for the optimal memory use we expect to load at least some MoE tensors:
+ const bool partial_moe = hp_nex > 0 && global_surplus_cpu_moe > 0;
+
+ std::vector<ggml_backend_buffer_type_t> overflow_bufts; // which bufts the partial layers of a device overflow to:
+ overflow_bufts.reserve(nd);
+ for (size_t id = 0; id < nd - 1; ++id) {
+ overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id + 1]));
+ }
+ overflow_bufts.push_back(ggml_backend_cpu_buffer_type());
+
+ std::vector<ngl_t> ngl_per_device(nd);
+ std::vector<int64_t> mem = get_memory_for_layers(__func__, ngl_per_device, overflow_bufts, partial_moe);
+ if (hp_nex > 0) {
+ for (size_t id = 0; id < nd; id++) {
+ ngl_per_device[id].overflow_type = LAYER_FRACTION_MOE;
+ }
+ }
+
+ // optimize the number of layers per device using the method of false position:
+ // - ngl_per_device has 0 layers for each device, lower bound
+ // - try a "high" configuration where a device is given all unassigned layers
+ // - interpolate the memory use / layer between low and high linearly to get a guess where it meets our target
+ // - check memory use of our guess, replace either the low or high bound
+ // - once we only have a difference of a single layer, stop and return the lower bound that just barely still fits
+ if (hp_nex == 0) {
+ LLAMA_LOG_INFO("%s: filling dense layers back-to-front:\n", __func__);
+ } else {
+ LLAMA_LOG_INFO("%s: filling dense-only layers back-to-front:\n", __func__);
+ }
+ for (int id = nd - 1; id >= 0; id--) {
+ uint32_t n_unassigned = hp_ngl;
+ for (size_t jd = id + 1; jd < nd; ++jd) {
+ assert(n_unassigned >= ngl_per_device[jd].n_layer);
+ n_unassigned -= ngl_per_device[jd].n_layer;
+ }
+
+ std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+ ngl_per_device_high[id].n_layer = n_unassigned;
+ if (hp_nex > 0) {
+ ngl_per_device_high[id].n_part = ngl_per_device_high[id].n_layer;
+ }
+ if (ngl_per_device_high[id].n_layer > 0) {
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+ if (mem_high[id] > targets[id]) {
+ assert(ngl_per_device_high[id].n_layer > ngl_per_device[id].n_layer);
+ uint32_t delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+ LLAMA_LOG_DEBUG("%s: start filling device %" PRIu32 ", delta=%" PRIu32 "\n", __func__, id, delta);
+ while (delta > 1) {
+ uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+ step_size = std::max(step_size, uint32_t(1));
+ step_size = std::min(step_size, delta - 1);
+
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ ngl_per_device_test[id].n_layer += step_size;
+ if (hp_nex) {
+ ngl_per_device_test[id].n_part += step_size;
+ }
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+
+ if (mem_test[id] <= targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+ } else {
+ ngl_per_device_high = ngl_per_device_test;
+ mem_high = mem_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device_high[id].n_layer);
+ }
+ delta = ngl_per_device_high[id].n_layer - ngl_per_device[id].n_layer;
+ }
+ } else {
+ assert(ngl_per_device_high[id].n_layer == n_unassigned);
+ ngl_per_device = ngl_per_device_high;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%d].n_layer=%" PRIu32 "\n", __func__, id, ngl_per_device[id].n_layer);
+ }
+ }
+
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers, %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, mem[id]/MiB, projected_margin/MiB);
+ }
+ if (hp_nex == 0 || global_surplus_cpu_moe <= 0) {
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ return;
+ }
+
+ // step 4: for a MoE model where all dense tensors fit,
+ // convert the dense-only layers in the back to full layers in the front until all devices are full
+ // essentially the same procedure as for the dense-only layers except front-to-back
+ // also, try fitting at least part of one more layer to reduce waste for "small" GPUs with e.g. 24 GiB VRAM
+
+ size_t id_dense_start = nd;
+ for (int id = nd - 1; id >= 0; id--) {
+ if (ngl_per_device[id].n_layer > 0) {
+ id_dense_start = id;
+ continue;
+ }
+ break;
+ }
+ assert(id_dense_start < nd);
+
+ LLAMA_LOG_INFO("%s: converting dense-only layers to full layers and filling them front-to-back with overflow to next device/system memory:\n", __func__);
+ for (size_t id = 0; id <= id_dense_start; id++) {
+ std::vector<ngl_t> ngl_per_device_high = ngl_per_device;
+ for (size_t jd = id_dense_start; jd < nd; jd++) {
+ const uint32_t n_layer_move = ngl_per_device_high[jd].n_layer;
+ ngl_per_device_high[id].n_layer += n_layer_move;
+ ngl_per_device_high[jd].n_layer -= n_layer_move;
+ ngl_per_device_high[jd].n_part = 0;
+ }
+ size_t id_dense_start_high = nd - 1;
+ std::vector<int64_t> mem_high = get_memory_for_layers(__func__, ngl_per_device_high, overflow_bufts, partial_moe);
+
+ if (mem_high[id] > targets[id]) {
+ assert(ngl_per_device_high[id].n_layer >= ngl_per_device_high[id].n_part);
+ assert(ngl_per_device[id].n_layer >= ngl_per_device[id].n_part);
+ assert((ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+ >= ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ uint32_t delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+ - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ while (delta > 1) {
+ uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]);
+ step_size = std::max(step_size, uint32_t(1));
+ step_size = std::min(step_size, delta - 1);
+
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ size_t id_dense_start_test = id_dense_start;
+ uint32_t n_converted_test = 0;
+ for (;id_dense_start_test < nd; id_dense_start_test++) {
+ const uint32_t n_convert_jd = std::min(step_size - n_converted_test, ngl_per_device_test[id_dense_start_test].n_part);
+ ngl_per_device_test[id_dense_start_test].n_layer -= n_convert_jd;
+ ngl_per_device_test[id_dense_start_test].n_part -= n_convert_jd;
+ ngl_per_device_test[id].n_layer += n_convert_jd;
+ n_converted_test += n_convert_jd;
+
+ if (ngl_per_device_test[id_dense_start_test].n_layer > 0) {
+ break;
+ }
+ }
+ const std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+
+ if (mem_test[id] <= targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ } else {
+ ngl_per_device_high = ngl_per_device_test;
+ mem_high = mem_test;
+ id_dense_start_high = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device_high[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start_high=%zu\n",
+ __func__, id, ngl_per_device_high[id].n_layer, ngl_per_device_high[id].n_part, id_dense_start_high);
+ }
+ delta = (ngl_per_device_high[id].n_layer - ngl_per_device_high[id].n_part)
+ - (ngl_per_device[id].n_layer - ngl_per_device[id].n_part);
+ }
+ } else {
+ ngl_per_device = ngl_per_device_high;
+ id_dense_start = id_dense_start_high;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part)=(%" PRIu32 ", %" PRIu32 "), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+
+ // try to fit at least part of one more layer
+ if (ngl_per_device[id_dense_start].n_layer > 0) {
+ std::vector<ngl_t> ngl_per_device_test = ngl_per_device;
+ size_t id_dense_start_test = id_dense_start;
+ ngl_per_device_test[id_dense_start_test].n_layer--;
+ ngl_per_device_test[id_dense_start_test].n_part--;
+ ngl_per_device_test[id].n_layer++;
+ ngl_per_device_test[id].n_part++;
+ if (ngl_per_device_test[id_dense_start_test].n_layer == 0) {
+ id_dense_start_test++;
+ }
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_UP;
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_UP\n", __func__);
+ std::vector<int64_t> mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ if (mem_test[id] < targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", UP), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_GATE;
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_GATE\n", __func__);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ if (mem_test[id] < targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", GATE), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+ } else {
+ ngl_per_device_test[id].overflow_type = LAYER_FRACTION_ATTN;
+ LLAMA_LOG_DEBUG("%s: trying to fit one extra layer with overflow_type=LAYER_FRACTION_ATTN\n", __func__);
+ mem_test = get_memory_for_layers(__func__, ngl_per_device_test, overflow_bufts, partial_moe);
+ if (mem_test[id] < targets[id]) {
+ ngl_per_device = ngl_per_device_test;
+ mem = mem_test;
+ id_dense_start = id_dense_start_test;
+ LLAMA_LOG_DEBUG("%s: set ngl_per_device[%zu].(n_layer, n_part, overflow_type)=(%" PRIu32 ", %" PRIu32 ", ATTN), id_dense_start=%zu\n",
+ __func__, id, ngl_per_device[id].n_layer, ngl_per_device[id].n_part, id_dense_start);
+ }
+ }
+ }
+
+ const int64_t projected_margin = dmds_full[id].free - mem[id];
+ LLAMA_LOG_INFO(
+ "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n",
+ __func__, dev_names[id].c_str(), ngl_per_device[id].n_layer, ngl_per_device[id].n_part, mem[id]/MiB, projected_margin/MiB);
+ }
+
+ set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, *mparams, partial_moe);
+ }
+
+ bool llama_params_fit(
+ const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams,
+ float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides,
+ size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) {
+ const int64_t t0_us = llama_time_us();
+ bool ok = true;
+ try {
+ llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level);
+ LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__);
+ } catch (const std::runtime_error & e) {
+ LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what());
+ ok = false;
+ }
+ const int64_t t1_us = llama_time_us();
+ LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6);
+ return ok;
+ }
+
+
  struct llama_sampler_chain_params llama_sampler_chain_default_params() {
  struct llama_sampler_chain_params result = {
  /*.no_perf =*/ true,
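Note: the core of the hunk above is the false-position search described in the step 3 comments of llama_params_fit_impl. The following is a minimal standalone sketch of that search strategy, not code from the package: measure_memory and its constants are invented stand-ins for get_memory_for_layers(), and only the interpolation/clamping logic mirrors the loop above.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// stand-in cost function: monotonically increasing in n_layer (made-up numbers, MiB)
static int64_t measure_memory(uint32_t n_layer) {
    const int64_t overhead        = 900;
    const int64_t bytes_per_layer = 350;
    return overhead + bytes_per_layer * int64_t(n_layer);
}

// returns the largest n_layer in [0, n_layer_max] whose memory use stays <= target
static uint32_t fit_layers(uint32_t n_layer_max, int64_t target) {
    uint32_t lo = 0;                 // lower bound, known to fit
    uint32_t hi = n_layer_max;       // upper bound, may not fit
    int64_t mem_lo = measure_memory(lo);
    int64_t mem_hi = measure_memory(hi);
    if (mem_hi <= target) {
        return hi;                   // everything fits, nothing to search
    }
    uint32_t delta = hi - lo;
    while (delta > 1) {
        // interpolate linearly between the bounds, clamped so each step makes progress
        uint32_t step = uint32_t(int64_t(delta) * (target - mem_lo) / (mem_hi - mem_lo));
        step = std::max(step, uint32_t(1));
        step = std::min(step, delta - 1);

        const uint32_t test     = lo + step;
        const int64_t  mem_test = measure_memory(test);
        if (mem_test <= target) {
            lo     = test;           // still fits -> raise the lower bound
            mem_lo = mem_test;
        } else {
            hi     = test;           // too big -> lower the upper bound
            mem_hi = mem_test;
        }
        delta = hi - lo;
    }
    return lo;                       // the configuration that just barely still fits
}

int main() {
    printf("layers that fit: %u\n", fit_layers(/*n_layer_max=*/48, /*target=*/12000));
    return 0;
}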
@@ -49,6 +718,10 @@ size_t llama_max_devices(void) {
  return 16;
  }

+ size_t llama_max_tensor_buft_overrides() {
+ return 4096;
+ }
+
  bool llama_supports_mmap(void) {
  return llama_mmap::SUPPORTED;
  }
@@ -108,11 +781,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
  model.t_start_us = tm.t_start_us;

  try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);

  ml.print_info();

  model.hparams.vocab_only = params.vocab_only;
+ model.hparams.no_alloc = params.no_alloc;

  try {
  model.load_arch(ml);
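Note: the new public entry points visible in this diff are llama_params_fit() and llama_max_tensor_buft_overrides(). Below is a hedged caller-side sketch based only on the signatures shown above; the model path, margin, and minimum context are placeholders, and calling the fit function before loading the model is an assumption about intended usage, not documented behavior of the package.

// Hypothetical usage sketch; values marked "placeholder"/"assumption" are not from the diff.
#include <vector>
#include "llama.h"

int main() {
    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // caller-owned buffers that llama_params_fit() may fill in
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides());

    const char * path = "model.gguf";              // placeholder path
    const bool fitted = llama_params_fit(
        path, &mparams, &cparams,
        tensor_split.data(), tbo.data(),
        /*margin_s  =*/ size_t(1024)*1024*1024,    // leave ~1 GiB free per device (assumption)
        /*n_ctx_min =*/ 4096,                      // placeholder lower bound on context
        GGML_LOG_LEVEL_INFO);

    // even if fitting fails, loading with the (possibly unchanged) params is still possible
    llama_model * model = llama_model_load_from_file(path, mparams);
    if (!model) {
        return 1;
    }
    llama_context * ctx = llama_init_from_model(model, cparams);
    // ... use ctx ...
    llama_free(ctx);
    llama_model_free(model);
    return fitted ? 0 : 2;
}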