@fugood/llama.node 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +14 -14
  3. package/scripts/llama.cpp.patch +17 -13
  4. package/src/LlamaCompletionWorker.cpp +2 -0
  5. package/src/LlamaContext.cpp +3 -0
  6. package/src/llama.cpp/common/arg.cpp +80 -10
  7. package/src/llama.cpp/common/chat.cpp +52 -8
  8. package/src/llama.cpp/common/chat.h +7 -2
  9. package/src/llama.cpp/common/common.cpp +1 -0
  10. package/src/llama.cpp/common/common.h +16 -6
  11. package/src/llama.cpp/common/speculative.cpp +135 -54
  12. package/src/llama.cpp/common/speculative.h +8 -1
  13. package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
  14. package/src/llama.cpp/ggml/include/ggml.h +37 -1
  15. package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
  16. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
  17. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
  18. package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
  19. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
  20. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
  21. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
  22. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
  23. package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
  24. package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
  25. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
  26. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
  27. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
  28. package/src/llama.cpp/include/llama.h +9 -4
  29. package/src/llama.cpp/src/llama-arch.cpp +105 -0
  30. package/src/llama.cpp/src/llama-arch.h +12 -0
  31. package/src/llama.cpp/src/llama-batch.cpp +1 -1
  32. package/src/llama.cpp/src/llama-chat.cpp +33 -1
  33. package/src/llama.cpp/src/llama-chat.h +2 -0
  34. package/src/llama.cpp/src/llama-context.cpp +19 -10
  35. package/src/llama.cpp/src/llama-context.h +4 -1
  36. package/src/llama.cpp/src/llama-graph.cpp +175 -148
  37. package/src/llama.cpp/src/llama-graph.h +60 -23
  38. package/src/llama.cpp/src/llama-hparams.h +5 -3
  39. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
  40. package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
  41. package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
  42. package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
  43. package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
  44. package/src/llama.cpp/src/llama-model-loader.h +3 -2
  45. package/src/llama.cpp/src/llama-model.cpp +949 -75
  46. package/src/llama.cpp/src/llama-model.h +24 -4
  47. package/src/llama.cpp/src/llama-quant.cpp +40 -4
  48. package/src/llama.cpp/src/llama-vocab.cpp +49 -1
  49. package/src/llama.cpp/src/llama-vocab.h +1 -0

package/src/llama.cpp/common/speculative.cpp
@@ -1,30 +1,39 @@
  #include "speculative.h"

+ #include "ggml.h"
+ #include "llama.h"
  #include "log.h"
  #include "common.h"
  #include "sampling.h"

  #include <cstring>
  #include <algorithm>
+ #include <map>

  #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
  #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

  struct common_speculative {
- struct llama_context * ctx;
+ struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
+ struct llama_context * ctx_dft;
  struct common_sampler * smpl;

  llama_batch batch;
- llama_tokens prompt;
+ llama_tokens prompt_dft;
+ bool vocab_dft_compatible = true; // whether retokenization is needed
+ std::map<std::string, std::string> tgt_dft_replacements = {};
  };

  struct common_speculative * common_speculative_init(
+ struct llama_context * ctx_tgt,
  struct llama_context * ctx_dft) {
  auto * result = new common_speculative {
- /* .ctx = */ ctx_dft,
- /* .smpl = */ nullptr,
- /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
- /* .prompt = */ {},
+ /* .ctx_tgt = */ ctx_tgt,
+ /* .ctx_dft = */ ctx_dft,
+ /* .smpl = */ nullptr,
+ /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+ /* .prompt_dft = */ {},
+ /* .vocab_dft_compatible = */ false,
  };

  // TODO: optimize or pass from outside?
@@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init(
  }
  #endif

+ result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
+ LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
+
  return result;
  }

@@ -75,8 +87,8 @@ void common_speculative_free(struct common_speculative * spec) {
  }

  bool common_speculative_are_compatible(
- const struct llama_context * ctx_tgt,
- const struct llama_context * ctx_dft) {
+ const struct llama_context * ctx_tgt,
+ const struct llama_context * ctx_dft) {
  const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
  const struct llama_model * model_dft = llama_get_model(ctx_dft);

@@ -90,31 +102,32 @@ bool common_speculative_are_compatible(
  LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

  if (vocab_type_tgt != vocab_type_dft) {
- LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
- "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+ LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
+ LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
  return false;
  }

- if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
+ if (
+ llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
  llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
  llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
- llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)) {
- LOG_ERR("%s: draft vocab special tokens must match target vocab to use speculation\n", __func__);
- LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_tgt), llama_vocab_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_tgt));
- LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
+ llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
+ ) {
+ LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
  return false;
  }

  {
  const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
  const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
-
- const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+ const int vocab_diff = n_vocab_tgt > n_vocab_dft
+ ? n_vocab_tgt - n_vocab_dft
+ : n_vocab_dft - n_vocab_tgt;

  if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
- LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
- "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
- __func__, n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+ LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+ LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+ n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
  return false;
  }

@@ -122,8 +135,8 @@ bool common_speculative_are_compatible(
  const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
  const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
  if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
- LOG_ERR("%s: draft vocab vocab must match target vocab to use speculation but "
- "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+ LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
+ LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
  common_token_to_piece(ctx_tgt, i).c_str(),
  common_token_to_piece(ctx_dft, i).c_str());
  return false;
@@ -134,32 +147,93 @@ bool common_speculative_are_compatible(
  return true;
  }

+ void common_speculative_add_replacement_tgt_dft(
+ struct common_speculative * spec,
+ const char *source, const char *dest) {
+ spec->tgt_dft_replacements[source] = dest;
+ }
+
+ static std::string replace_to_dft(
+ struct common_speculative * spec,
+ const std::string& input) {
+ std::string result = input;
+ for (const auto & pair : spec->tgt_dft_replacements) {
+ size_t pos = result.find(pair.first);
+ while (pos != std::string::npos) {
+ result.replace(pos, pair.first.length(), pair.second);
+ pos = result.find(pair.first, pos + pair.second.length());
+ }
+ }
+ return result;
+ }
+
+ static std::string replace_to_tgt(
+ struct common_speculative * spec,
+ const std::string& input) {
+ std::string result = input;
+ for (const auto& pair : spec->tgt_dft_replacements) {
+ size_t pos = result.find(pair.second);
+ while (pos != std::string::npos) {
+ result.replace(pos, pair.second.length(), pair.first);
+ pos = result.find(pair.second, pos + pair.first.length());
+ }
+ }
+ return result;
+ }
+
+
  llama_tokens common_speculative_gen_draft(
  struct common_speculative * spec,
  struct common_speculative_params params,
- const llama_tokens & prompt_tgt,
+ const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
  llama_token id_last) {
  auto & batch = spec->batch;
- auto & ctx = spec->ctx;
+ auto & ctx_tgt = spec->ctx_tgt;
+ auto & ctx_dft = spec->ctx_dft;
  auto & smpl = spec->smpl;
- auto & prompt = spec->prompt;
+ auto & prompt_dft = spec->prompt_dft;

- auto * mem = llama_get_memory(ctx);
+ auto * mem_dft = llama_get_memory(ctx_dft);

  int reuse_i = 0;
  int reuse_n = 0;

- const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+ const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;
+
+ llama_tokens prompt_tgt_draft_model;
+ if (!spec->vocab_dft_compatible) {
+ std::string text;
+ text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
+ text = replace_to_dft(spec, text);
+ LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
+ prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);
+
+ // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
+ const auto * model_tgt = llama_get_model(ctx_tgt);
+ const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
+
+ int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
+ GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
+ text.resize(-n_chars);
+ llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
+ text = replace_to_dft(spec, text);
+
+ LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
+ id_last = common_tokenize(ctx_dft, text, false, true)[0];
+ }
+ // prompt_tgt's tokens will always be compatible with ctx_dft
+ const llama_tokens &prompt_tgt =
+ spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;

  const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);

  // reuse as much as possible from the old draft context
  // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
- for (int i = 0; i < (int) prompt.size(); ++i) {
+ for (int i = 0; i < (int) prompt_dft.size(); ++i) {
  int cur = 0;
  while (i_start + cur < (int) prompt_tgt.size() &&
- i + cur < (int) prompt.size() &&
- prompt_tgt[i_start + cur] == prompt[i + cur]) {
+ i + cur < (int) prompt_dft.size() &&
+ prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
  cur++;
  }

@@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft(
  }
  }

- LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+ LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());

  llama_tokens result;
  result.reserve(params.n_draft);

  if (reuse_n == 0) {
- llama_memory_clear(mem, false);
-
- prompt.clear();
+ llama_memory_clear(mem_dft, false);
+ prompt_dft.clear();
  } else {
  // this happens when a previous draft has been discarded (for example, due to being too small), but the
  // target model agreed with it. in this case, we simply pass back the previous results to save compute
- if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
- for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
- result.push_back(prompt[i]);
+ if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
+ for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
+ result.push_back(prompt_dft[i]);

  if (params.n_draft <= (int) result.size()) {
  break;
@@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft(
  }

  if (reuse_i > 0) {
- llama_memory_seq_rm (mem, 0, 0, reuse_i);
- llama_memory_seq_add(mem, 0, reuse_i, -1, -reuse_i);
+ llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
+ llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);

- prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+ prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
  }

- if (reuse_n < (int) prompt.size()) {
- llama_memory_seq_rm (mem, 0, reuse_n, -1);
-
- prompt.erase(prompt.begin() + reuse_n, prompt.end());
+ if (reuse_n < (int) prompt_dft.size()) {
+ llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
+ prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
  }
  }

@@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft(
  //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
  common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);

- prompt.push_back(prompt_tgt[i]);
+ prompt_dft.push_back(prompt_tgt[i]);
  }

  // we should rarely end-up here during normal decoding
  if (batch.n_tokens > 0) {
  //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());

- llama_decode(ctx, batch);
+ llama_decode(ctx_dft, batch);
  }

- const llama_pos n_past = prompt.size();
+ const llama_pos n_past = prompt_dft.size();

  LOG_DBG("%s: n_past = %d\n", __func__, n_past);

  common_batch_clear(batch);
  common_batch_add (batch, id_last, n_past, { 0 }, true);

- prompt.push_back(id_last);
+ prompt_dft.push_back(id_last);

- //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+ LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());

- llama_decode(ctx, batch);
+ llama_decode(ctx_dft, batch);

  common_sampler_reset(smpl);

@@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft(
  for (int i = 0; i < params.n_draft; ++i) {
  common_batch_clear(batch);

- common_sampler_sample(smpl, ctx, 0, true);
+ common_sampler_sample(smpl, ctx_dft, 0, true);

  const auto * cur_p = common_sampler_get_candidates(smpl);

  for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
  LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
- k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+ k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
  }

  // add drafted token for each sequence
@@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft(
  common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

  // evaluate the drafted tokens on the draft model
- llama_decode(ctx, batch);
+ llama_decode(ctx_dft, batch);

- prompt.push_back(id);
+ prompt_dft.push_back(id);
  }

+ if (!spec->vocab_dft_compatible) {
+ std::string detokenized = common_detokenize(ctx_dft, result, true);
+ detokenized = replace_to_tgt(spec, detokenized);
+ LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
+ result = common_tokenize(ctx_tgt, detokenized, false, true);
+ if (result.size() > (size_t)params.n_draft) {
+ result.resize(params.n_draft);
+ }
+ }
  return result;
  }

package/src/llama.cpp/common/speculative.h
@@ -12,7 +12,10 @@ struct common_speculative_params {
  float p_min = 0.75f; // min probability required to accept a token in the draft
  };

- struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+ struct common_speculative * common_speculative_init(
+ struct llama_context * ctx_tgt,
+ struct llama_context * ctx_dft
+ );

  void common_speculative_free(struct common_speculative * spec);

@@ -20,6 +23,10 @@ bool common_speculative_are_compatible(
  const struct llama_context * ctx_tgt,
  const struct llama_context * ctx_dft);

+ void common_speculative_add_replacement_tgt_dft(
+ struct common_speculative * spec,
+ const char *source, const char *dest);
+
  // sample up to n_draft tokens and add them to the batch using the draft model
  llama_tokens common_speculative_gen_draft(
  struct common_speculative * spec,
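
The speculative.cpp/.h changes above extend the speculative-decoding helper to take both the target and the draft context, so a draft can be retokenized when the two models do not share a compatible vocabulary. A minimal caller-side sketch of the updated API, assuming ctx_tgt and ctx_dft are existing llama_context handles and that prompt_tgt, id_last, the n_draft value, and the replacement strings are placeholders:

    // sketch only: context creation, prompt tokenization and draft verification are omitted
    common_speculative_params params;
    params.n_draft = 16;    // number of tokens to draft per call (placeholder)
    params.p_min   = 0.75f; // default from speculative.h

    struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);

    // optional: map target-vocab strings to draft-vocab strings when the vocabs differ
    // (the token strings here are hypothetical examples)
    common_speculative_add_replacement_tgt_dft(spec, "<|im_start|>", "<|start|>");

    llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);
    // ... verify the drafted tokens against the target model ...

    common_speculative_free(spec);

When vocab_dft_compatible is false, common_speculative_gen_draft detokenizes the target prompt, applies the registered replacements, retokenizes with the draft vocabulary, and converts the drafted tokens back to the target vocabulary before returning them.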

package/src/llama.cpp/ggml/CMakeLists.txt
@@ -39,8 +39,9 @@ if (WIN32)
  set(CMAKE_SHARED_MODULE_PREFIX "")
  endif()

- option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
- option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+ option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+ option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+ set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")

  #
  # option list
@@ -174,6 +175,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
  option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
  option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
  option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+ option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
  option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
  option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
  option(GGML_VULKAN "ggml: use Vulkan" OFF)

package/src/llama.cpp/ggml/include/ggml.h
@@ -304,6 +304,16 @@
  GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
  GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

+ #define GGML_TENSOR_TERNARY_OP_LOCALS \
+ GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+ GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
  #define GGML_TENSOR_BINARY_OP_LOCALS01 \
  GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
  GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
@@ -395,7 +405,8 @@ extern "C" {
  // GGML_TYPE_IQ4_NL_4_4 = 36,
  // GGML_TYPE_IQ4_NL_4_8 = 37,
  // GGML_TYPE_IQ4_NL_8_8 = 38,
- GGML_TYPE_COUNT = 39,
+ GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+ GGML_TYPE_COUNT = 40,
  };

  // precision
@@ -430,6 +441,7 @@ extern "C" {
  GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
  GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
  GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+ GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
  };

  // available tensor operations:
@@ -438,6 +450,7 @@ extern "C" {

  GGML_OP_DUP,
  GGML_OP_ADD,
+ GGML_OP_ADD_ID,
  GGML_OP_ADD1,
  GGML_OP_ACC,
  GGML_OP_SUB,
@@ -557,6 +570,7 @@ extern "C" {
  GGML_GLU_OP_REGLU,
  GGML_GLU_OP_GEGLU,
  GGML_GLU_OP_SWIGLU,
+ GGML_GLU_OP_SWIGLU_OAI,
  GGML_GLU_OP_GEGLU_ERF,
  GGML_GLU_OP_GEGLU_QUICK,

@@ -831,6 +845,13 @@ extern "C" {
  struct ggml_tensor * b,
  enum ggml_type type);

+ // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+ GGML_API struct ggml_tensor * ggml_add_id(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * ids);
+
  GGML_API struct ggml_tensor * ggml_add1(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1198,6 +1219,13 @@ extern "C" {
  struct ggml_tensor * a,
  struct ggml_tensor * b);

+ GGML_API struct ggml_tensor * ggml_swiglu_oai(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ float alpha,
+ float limit);
+
  // normalize along rows
  GGML_API struct ggml_tensor * ggml_norm(
  struct ggml_context * ctx,
@@ -1570,6 +1598,10 @@ extern "C" {
  float scale,
  float max_bias);

+ GGML_API void ggml_soft_max_add_sinks(
+ struct ggml_tensor * a,
+ struct ggml_tensor * sinks);
+
  GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -2052,6 +2084,10 @@ extern "C" {
  GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
  const struct ggml_tensor * a);

+ GGML_API void ggml_flash_attn_ext_add_sinks(
+ struct ggml_tensor * a,
+ struct ggml_tensor * sinks);
+
  // TODO: needs to be adapted to ggml_flash_attn_ext
  GGML_API struct ggml_tensor * ggml_flash_attn_back(
  struct ggml_context * ctx,
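
The ggml.h additions above introduce the MXFP4 type, the ADD_ID and SWIGLU_OAI operators, and sink support for soft-max and flash-attention. ggml_add_id is documented only by the one-line comment dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]; the following plain C++ loop (a reference sketch, not the ggml implementation, using made-up names and contiguous row-major buffers) spells out that indexing (each (i1, i2) position adds the row of b selected by ids):

    #include <cstdint>
    #include <vector>

    // Reference semantics of ggml_add_id:
    //   a, dst: ne0 x ne1 x ne2 floats
    //   b:      ne0 x n_rows floats (rows indexed by id)
    //   ids:    ne1 x ne2 int32 row indices into b
    static void add_id_reference(const std::vector<float> & a,
                                 const std::vector<float> & b,
                                 const std::vector<int32_t> & ids,
                                 std::vector<float> & dst,
                                 int64_t ne0, int64_t ne1, int64_t ne2) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            for (int64_t i1 = 0; i1 < ne1; ++i1) {
                const int32_t row = ids[i2*ne1 + i1];                // ids[i1, i2]
                for (int64_t i0 = 0; i0 < ne0; ++i0) {
                    dst[(i2*ne1 + i1)*ne0 + i0] =
                        a[(i2*ne1 + i1)*ne0 + i0] + b[row*ne0 + i0]; // a + selected row of b
                }
            }
        }
    }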

package/src/llama.cpp/ggml/src/CMakeLists.txt
@@ -214,6 +214,13 @@ add_library(ggml
  ggml-backend-reg.cpp)
  add_library(ggml::ggml ALIAS ggml)

+ if (GGML_BACKEND_DIR)
+ if (NOT GGML_BACKEND_DL)
+ message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+ endif()
+ target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+ endif()
+
  target_link_libraries(ggml PUBLIC ggml-base)

  if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
  set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
  target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
  add_dependencies(ggml ${backend})
- install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+ if (GGML_BACKEND_DIR)
+ install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+ else()
+ install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+ endif()
  else()
  add_library(${backend} ${ARGN})
  target_link_libraries(ggml PUBLIC ${backend})

package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c
@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
  *s = sumf;
  }

+ void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+ assert(nrc == 1);
+ UNUSED(nrc);
+ UNUSED(bx);
+ UNUSED(by);
+ UNUSED(bs);
+ assert(n % QK_MXFP4 == 0);
+ static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+ const block_mxfp4 * GGML_RESTRICT x = vx;
+ const block_q8_0 * GGML_RESTRICT y = vy;
+
+ const int nb = n / QK_MXFP4;
+
+ int ib = 0;
+ float sumf = 0;
+
+ #if defined __ARM_NEON
+ const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+ const uint8x16_t m4b = vdupq_n_u8(0x0f);
+ uint8x16x2_t q4bits;
+ int8x16x4_t q4b;
+ int8x16x4_t q8b;
+ int32x4_t prod_1;
+ int32x4_t prod_2;
+
+ for (; ib + 1 < nb; ib += 2) {
+ q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+ q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+ q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+ q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+ q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+ q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
+
+ q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+ q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+ q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+ q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+ prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+ prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+ sumf +=
+ GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+ GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+ }
+
+ #endif
+ for (; ib < nb; ++ib) {
+ const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+ int sumi1 = 0;
+ int sumi2 = 0;
+ for (int j = 0; j < QK_MXFP4/2; ++j) {
+ sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+ sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+ }
+ sumf += d * (sumi1 + sumi2);
+ }
+ *s = sumf;
+ }
+
  void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
  const int qk = QK8_0;
  const int nb = n / qk;