@fugood/llama.node 1.1.4 → 1.1.6
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +14 -14
- package/scripts/llama.cpp.patch +17 -13
- package/src/LlamaCompletionWorker.cpp +2 -0
- package/src/LlamaContext.cpp +3 -0
- package/src/llama.cpp/common/arg.cpp +80 -10
- package/src/llama.cpp/common/chat.cpp +52 -8
- package/src/llama.cpp/common/chat.h +7 -2
- package/src/llama.cpp/common/common.cpp +1 -0
- package/src/llama.cpp/common/common.h +16 -6
- package/src/llama.cpp/common/speculative.cpp +135 -54
- package/src/llama.cpp/common/speculative.h +8 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/include/ggml.h +37 -1
- package/src/llama.cpp/ggml/src/CMakeLists.txt +12 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +61 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +96 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +3196 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +207 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.c +35 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/quants.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +263 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +19 -4
- package/src/llama.cpp/include/llama.h +9 -4
- package/src/llama.cpp/src/llama-arch.cpp +105 -0
- package/src/llama.cpp/src/llama-arch.h +12 -0
- package/src/llama.cpp/src/llama-batch.cpp +1 -1
- package/src/llama.cpp/src/llama-chat.cpp +33 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +19 -10
- package/src/llama.cpp/src/llama-context.h +4 -1
- package/src/llama.cpp/src/llama-graph.cpp +175 -148
- package/src/llama.cpp/src/llama-graph.h +60 -23
- package/src/llama.cpp/src/llama-hparams.h +5 -3
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +6 -2
- package/src/llama.cpp/src/llama-kv-cache-unified.h +1 -1
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +2 -1
- package/src/llama.cpp/src/llama-memory-hybrid.h +1 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1 -0
- package/src/llama.cpp/src/llama-model-loader.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +949 -75
- package/src/llama.cpp/src/llama-model.h +24 -4
- package/src/llama.cpp/src/llama-quant.cpp +40 -4
- package/src/llama.cpp/src/llama-vocab.cpp +49 -1
- package/src/llama.cpp/src/llama-vocab.h +1 -0
package/src/llama.cpp/common/speculative.cpp

@@ -1,30 +1,39 @@
 #include "speculative.h"
 
+#include "ggml.h"
+#include "llama.h"
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
 
 #include <cstring>
 #include <algorithm>
+#include <map>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
 
 struct common_speculative {
-    struct llama_context *
+    struct llama_context * ctx_tgt; // only used for retokenizing from ctx_dft
+    struct llama_context * ctx_dft;
     struct common_sampler * smpl;
 
     llama_batch batch;
-    llama_tokens
+    llama_tokens prompt_dft;
+    bool vocab_dft_compatible = true; // whether retokenization is needed
+    std::map<std::string, std::string> tgt_dft_replacements = {};
 };
 
 struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_tgt,
         struct llama_context * ctx_dft) {
     auto * result = new common_speculative {
-        /* .
-        /* .
-        /* .
-        /* .
+        /* .ctx_tgt = */ ctx_tgt,
+        /* .ctx_dft = */ ctx_dft,
+        /* .smpl = */ nullptr,
+        /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt_dft = */ {},
+        /* .vocab_dft_compatible = */ false,
     };
 
     // TODO: optimize or pass from outside?

@@ -59,6 +68,9 @@ struct common_speculative * common_speculative_init(
     }
 #endif
 
+    result->vocab_dft_compatible = common_speculative_are_compatible(ctx_tgt, ctx_dft);
+    LOG_DBG("vocab_dft_compatible = %d\n", result->vocab_dft_compatible);
+
     return result;
 }
 

@@ -75,8 +87,8 @@ void common_speculative_free(struct common_speculative * spec) {
 }
 
 bool common_speculative_are_compatible(
-
-
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
     const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
     const struct llama_model * model_dft = llama_get_model(ctx_dft);
 

@@ -90,31 +102,32 @@ bool common_speculative_are_compatible(
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
-
-
+        LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
+        LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
         return false;
     }
 
-    if (
+    if (
+        llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
         llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
         llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
-        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
-
-
-        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_vocab_bos(vocab_dft), llama_vocab_get_add_bos(vocab_dft), llama_vocab_eos(vocab_dft), llama_vocab_get_add_eos(vocab_dft));
+        llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
+    ) {
+        LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
         return false;
     }
 
     {
         const int n_vocab_tgt = llama_vocab_n_tokens(vocab_tgt);
         const int n_vocab_dft = llama_vocab_n_tokens(vocab_dft);
-
-
+        const int vocab_diff = n_vocab_tgt > n_vocab_dft
+            ? n_vocab_tgt - n_vocab_dft
+            : n_vocab_dft - n_vocab_tgt;
 
         if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
-
-
-
+            LOG_DBG("%s: draft model vocab must closely match target model to use speculation but ", __func__);
+            LOG_DBG("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    n_vocab_tgt, llama_vocab_n_tokens(vocab_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
             return false;
         }
 

@@ -122,8 +135,8 @@ bool common_speculative_are_compatible(
             const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
             const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
             if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
-
-
+                LOG_DBG("%s: draft model vocab must match target model to use speculation but ", __func__);
+                LOG_DBG("token %d content differs - target '%s', draft '%s'\n", i,
                     common_token_to_piece(ctx_tgt, i).c_str(),
                     common_token_to_piece(ctx_dft, i).c_str());
                 return false;

@@ -134,32 +147,93 @@ bool common_speculative_are_compatible(
     return true;
 }
 
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec,
+        const char *source, const char *dest) {
+    spec->tgt_dft_replacements[source] = dest;
+}
+
+static std::string replace_to_dft(
+        struct common_speculative * spec,
+        const std::string& input) {
+    std::string result = input;
+    for (const auto & pair : spec->tgt_dft_replacements) {
+        size_t pos = result.find(pair.first);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.first.length(), pair.second);
+            pos = result.find(pair.first, pos + pair.second.length());
+        }
+    }
+    return result;
+}
+
+static std::string replace_to_tgt(
+        struct common_speculative * spec,
+        const std::string& input) {
+    std::string result = input;
+    for (const auto& pair : spec->tgt_dft_replacements) {
+        size_t pos = result.find(pair.second);
+        while (pos != std::string::npos) {
+            result.replace(pos, pair.second.length(), pair.first);
+            pos = result.find(pair.second, pos + pair.first.length());
+        }
+    }
+    return result;
+}
+
+
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
         struct common_speculative_params params,
-        const llama_tokens &
+        const llama_tokens & prompt_tgt_main_model, // specified in target model vocab
         llama_token id_last) {
     auto & batch = spec->batch;
-    auto &
+    auto & ctx_tgt = spec->ctx_tgt;
+    auto & ctx_dft = spec->ctx_dft;
     auto & smpl = spec->smpl;
-    auto &
+    auto & prompt_dft = spec->prompt_dft;
 
-    auto *
+    auto * mem_dft = llama_get_memory(ctx_dft);
 
     int reuse_i = 0;
     int reuse_n = 0;
 
-    const int n_ctx = llama_n_ctx(
+    const int n_ctx = llama_n_ctx(ctx_dft) - params.n_draft;
+
+    llama_tokens prompt_tgt_draft_model;
+    if (!spec->vocab_dft_compatible) {
+        std::string text;
+        text = common_detokenize(ctx_tgt, prompt_tgt_main_model, true);
+        text = replace_to_dft(spec, text);
+        LOG_DBG("%s: main->draft detokenized string: '%s'\n", __func__, text.c_str());
+        prompt_tgt_draft_model = common_tokenize(ctx_dft, text, false, true);
+
+        // convert id_last to draft vocab. llama_detokenize is called directly to avoid an allocation
+        const auto * model_tgt = llama_get_model(ctx_tgt);
+        const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
+
+        int32_t n_chars = llama_detokenize(vocab_tgt, &id_last, 1, nullptr, 0, false, false);
+        GGML_ASSERT(n_chars < 0 && "failed to detokenize id_last");
+        text.resize(-n_chars);
+        llama_detokenize(vocab_tgt, &id_last, 1, text.data(), text.size(), false, false);
+        text = replace_to_dft(spec, text);
+
+        LOG_DBG("main->draft detokenized id_last(%d): '%s'\n", id_last, text.c_str());
+        id_last = common_tokenize(ctx_dft, text, false, true)[0];
+    }
+    // prompt_tgt's tokens will always be compatible with ctx_dft
+    const llama_tokens &prompt_tgt =
+        spec->vocab_dft_compatible ? prompt_tgt_main_model : prompt_tgt_draft_model;
 
     const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
 
     // reuse as much as possible from the old draft context
     // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
-    for (int i = 0; i < (int)
+    for (int i = 0; i < (int) prompt_dft.size(); ++i) {
         int cur = 0;
         while (i_start + cur < (int) prompt_tgt.size() &&
-               i + cur < (int)
-               prompt_tgt[i_start + cur] ==
+               i + cur < (int) prompt_dft.size() &&
+               prompt_tgt[i_start + cur] == prompt_dft[i + cur]) {
             cur++;
         }
 

@@ -169,21 +243,20 @@ llama_tokens common_speculative_gen_draft(
         }
     }
 
-    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int)
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt_dft.size());
 
     llama_tokens result;
     result.reserve(params.n_draft);
 
     if (reuse_n == 0) {
-        llama_memory_clear(
-
-        prompt.clear();
+        llama_memory_clear(mem_dft, false);
+        prompt_dft.clear();
     } else {
         // this happens when a previous draft has been discarded (for example, due to being too small), but the
         // target model agreed with it. in this case, we simply pass back the previous results to save compute
-        if (reuse_i + reuse_n < (int)
-            for (int i = reuse_i + reuse_n + 1; i < (int)
-                result.push_back(
+        if (reuse_i + reuse_n < (int) prompt_dft.size() && prompt_dft[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
+                result.push_back(prompt_dft[i]);
 
                 if (params.n_draft <= (int) result.size()) {
                     break;

@@ -194,16 +267,15 @@ llama_tokens common_speculative_gen_draft(
         }
 
         if (reuse_i > 0) {
-            llama_memory_seq_rm (
-            llama_memory_seq_add(
+            llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
+            llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
 
-
+            prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
         }
 
-        if (reuse_n < (int)
-            llama_memory_seq_rm (
-
-            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        if (reuse_n < (int) prompt_dft.size()) {
+            llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
+            prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
         }
     }
 

@@ -214,28 +286,28 @@ llama_tokens common_speculative_gen_draft(
         //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
         common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
 
-
+        prompt_dft.push_back(prompt_tgt[i]);
     }
 
     // we should rarely end-up here during normal decoding
     if (batch.n_tokens > 0) {
         //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
 
-        llama_decode(
+        llama_decode(ctx_dft, batch);
     }
 
-    const llama_pos n_past =
+    const llama_pos n_past = prompt_dft.size();
 
     LOG_DBG("%s: n_past = %d\n", __func__, n_past);
 
     common_batch_clear(batch);
     common_batch_add (batch, id_last, n_past, { 0 }, true);
 
-
+    prompt_dft.push_back(id_last);
 
-
+    LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
 
-    llama_decode(
+    llama_decode(ctx_dft, batch);
 
     common_sampler_reset(smpl);
 

@@ -243,13 +315,13 @@ llama_tokens common_speculative_gen_draft(
     for (int i = 0; i < params.n_draft; ++i) {
         common_batch_clear(batch);
 
-        common_sampler_sample(smpl,
+        common_sampler_sample(smpl, ctx_dft, 0, true);
 
         const auto * cur_p = common_sampler_get_candidates(smpl);
 
         for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
             LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
-                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
         }
 
         // add drafted token for each sequence

@@ -271,10 +343,19 @@ llama_tokens common_speculative_gen_draft(
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
-        llama_decode(
+        llama_decode(ctx_dft, batch);
 
-
+        prompt_dft.push_back(id);
     }
 
+    if (!spec->vocab_dft_compatible) {
+        std::string detokenized = common_detokenize(ctx_dft, result, true);
+        detokenized = replace_to_tgt(spec, detokenized);
+        LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
+        result = common_tokenize(ctx_tgt, detokenized, false, true);
+        if (result.size() > (size_t)params.n_draft) {
+            result.resize(params.n_draft);
+        }
+    }
     return result;
 }
package/src/llama.cpp/common/speculative.h

@@ -12,7 +12,10 @@ struct common_speculative_params {
     float p_min = 0.75f; // min probability required to accept a token in the draft
 };
 
-struct common_speculative * common_speculative_init(
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_tgt,
+        struct llama_context * ctx_dft
+);
 
 void common_speculative_free(struct common_speculative * spec);
 

@@ -20,6 +23,10 @@ bool common_speculative_are_compatible(
         const struct llama_context * ctx_tgt,
         const struct llama_context * ctx_dft);
 
+void common_speculative_add_replacement_tgt_dft(
+        struct common_speculative * spec,
+        const char *source, const char *dest);
+
 // sample up to n_draft tokens and add them to the batch using the draft model
 llama_tokens common_speculative_gen_draft(
         struct common_speculative * spec,
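The updated API above now takes both the target and the draft context at init time, and adds common_speculative_add_replacement_tgt_dft() for registering target->draft token-string replacements when the two vocabularies are not compatible. A minimal caller-side sketch (not part of the package; it assumes ctx_tgt and ctx_dft are already-created llama_context instances, and the replacement strings are placeholders):

#include "common.h"
#include "speculative.h"

// Sketch: generate one round of draft tokens with the updated speculative API.
// Creating ctx_tgt/ctx_dft (model load, context init, ...) is assumed to happen elsewhere.
static llama_tokens draft_once(llama_context * ctx_tgt, llama_context * ctx_dft,
                               const llama_tokens & prompt_tgt, llama_token id_last) {
    common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);

    // Optional: map special-token spellings between the two vocabs; these strings are placeholders.
    common_speculative_add_replacement_tgt_dft(spec, "<|im_start|>", "<start_of_turn>");

    common_speculative_params params;
    params.n_draft = 16;    // sample up to 16 draft tokens
    params.p_min   = 0.75f; // min probability required to accept a token in the draft

    // The returned tokens are expressed in the target model's vocab (retokenized if needed).
    llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);

    common_speculative_free(spec);
    return draft;
}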
package/src/llama.cpp/ggml/CMakeLists.txt

@@ -39,8 +39,9 @@ if (WIN32)
     set(CMAKE_SHARED_MODULE_PREFIX "")
 endif()
 
-option(BUILD_SHARED_LIBS
-option(GGML_BACKEND_DL
+option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
+set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")
 
 #
 # option list

@@ -174,6 +175,7 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_HIP_MMQ_MFMA "ggml: enable MFMA MMA for CDNA in MMQ" ON)
 option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
 option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
package/src/llama.cpp/ggml/include/ggml.h

@@ -304,6 +304,16 @@
     GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
     GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
+#define GGML_TENSOR_TERNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
 #define GGML_TENSOR_BINARY_OP_LOCALS01 \
     GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
     GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \

@@ -395,7 +405,8 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-
+        GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block)
+        GGML_TYPE_COUNT = 40,
     };
 
     // precision

@@ -430,6 +441,7 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
+        GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors
     };
 
     // available tensor operations:

@@ -438,6 +450,7 @@ extern "C" {
 
         GGML_OP_DUP,
         GGML_OP_ADD,
+        GGML_OP_ADD_ID,
         GGML_OP_ADD1,
         GGML_OP_ACC,
         GGML_OP_SUB,

@@ -557,6 +570,7 @@ extern "C" {
         GGML_GLU_OP_REGLU,
         GGML_GLU_OP_GEGLU,
         GGML_GLU_OP_SWIGLU,
+        GGML_GLU_OP_SWIGLU_OAI,
         GGML_GLU_OP_GEGLU_ERF,
         GGML_GLU_OP_GEGLU_QUICK,
 

@@ -831,6 +845,13 @@ extern "C" {
             struct ggml_tensor * b,
             enum ggml_type type);
 
+    // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]]
+    GGML_API struct ggml_tensor * ggml_add_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * ids);
+
     GGML_API struct ggml_tensor * ggml_add1(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -1198,6 +1219,13 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    GGML_API struct ggml_tensor * ggml_swiglu_oai(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            float alpha,
+            float limit);
+
     // normalize along rows
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,

@@ -1570,6 +1598,10 @@ extern "C" {
             float scale,
             float max_bias);
 
+    GGML_API void ggml_soft_max_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
             struct ggml_context * ctx,
             struct ggml_tensor * a,

@@ -2052,6 +2084,10 @@ extern "C" {
     GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
             const struct ggml_tensor * a);
 
+    GGML_API void ggml_flash_attn_ext_add_sinks(
+            struct ggml_tensor * a,
+            struct ggml_tensor * sinks);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
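The new ggml_add_id() declared above documents its semantics inline: dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]], i.e. for every (i1, i2) position the row of b that gets added is selected through the ids tensor. A standalone reference sketch of that indexing rule on plain arrays (this is not the ggml implementation; the layout, shapes, and function name are illustrative only):

#include <cstdint>

// dst[i0,i1,i2] = a[i0,i1,i2] + b[i0, ids[i1,i2]]
// a, dst: n0 x n1 x n2 (i0 contiguous); b: n0 x n_rows; ids: n1 x n2 of row indices into b.
static void add_id_reference(const float * a, const float * b, const int32_t * ids,
                             float * dst, int n0, int n1, int n2) {
    for (int i2 = 0; i2 < n2; ++i2) {
        for (int i1 = 0; i1 < n1; ++i1) {
            const int32_t row = ids[i2*n1 + i1]; // which row of b is added at this position
            for (int i0 = 0; i0 < n0; ++i0) {
                dst[(i2*n1 + i1)*n0 + i0] = a[(i2*n1 + i1)*n0 + i0] + b[row*n0 + i0];
            }
        }
    }
}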
package/src/llama.cpp/ggml/src/CMakeLists.txt

@@ -214,6 +214,13 @@ add_library(ggml
             ggml-backend-reg.cpp)
 add_library(ggml::ggml ALIAS ggml)
 
+if (GGML_BACKEND_DIR)
+    if (NOT GGML_BACKEND_DL)
+        message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
+    endif()
+    target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
+endif()
+
 target_link_libraries(ggml PUBLIC ggml-base)
 
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")

@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
         set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
-
+        if (GGML_BACKEND_DIR)
+            install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
+        else()
+            install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
+        endif()
     else()
         add_library(${backend} ${ARGN})
         target_link_libraries(ggml PUBLIC ${backend})
package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c

@@ -589,6 +589,67 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
     *s = sumf;
 }
 
+void ggml_vec_dot_mxfp4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
+    assert(nrc == 1);
+    UNUSED(nrc);
+    UNUSED(bx);
+    UNUSED(by);
+    UNUSED(bs);
+    assert(n % QK_MXFP4 == 0);
+    static_assert(QK_MXFP4 == QK8_0, "QK_MXFP4 and QK8_0 must be the same");
+
+    const block_mxfp4 * GGML_RESTRICT x = vx;
+    const block_q8_0 * GGML_RESTRICT y = vy;
+
+    const int nb = n / QK_MXFP4;
+
+    int ib = 0;
+    float sumf = 0;
+
+#if defined __ARM_NEON
+    const int8x16_t values = vld1q_s8(kvalues_mxfp4);
+    const uint8x16_t m4b = vdupq_n_u8(0x0f);
+    uint8x16x2_t q4bits;
+    int8x16x4_t q4b;
+    int8x16x4_t q8b;
+    int32x4_t prod_1;
+    int32x4_t prod_2;
+
+    for (; ib + 1 < nb; ib += 2) {
+        q4bits.val[0] = vld1q_u8(x[ib + 0].qs);
+        q4bits.val[1] = vld1q_u8(x[ib + 1].qs);
+        q8b.val[0] = vld1q_s8(y[ib + 0].qs);
+        q8b.val[1] = vld1q_s8(y[ib + 0].qs + 16);
+        q8b.val[2] = vld1q_s8(y[ib + 1].qs);
+        q8b.val[3] = vld1q_s8(y[ib + 1].qs + 16);
+
+        q4b.val[0] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[0], m4b));
+        q4b.val[1] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[0], 4));
+        q4b.val[2] = ggml_vqtbl1q_s8(values, vandq_u8 (q4bits.val[1], m4b));
+        q4b.val[3] = ggml_vqtbl1q_s8(values, vshrq_n_u8(q4bits.val[1], 4));
+
+        prod_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[0], q8b.val[0]), q4b.val[1], q8b.val[1]);
+        prod_2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q4b.val[2], q8b.val[2]), q4b.val[3], q8b.val[3]);
+
+        sumf +=
+            GGML_E8M0_TO_FP32_HALF(x[ib + 0].e) * GGML_CPU_FP16_TO_FP32(y[ib + 0].d) * vaddvq_s32(prod_1) +
+            GGML_E8M0_TO_FP32_HALF(x[ib + 1].e) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) * vaddvq_s32(prod_2);
+    }
+
+#endif
+    for (; ib < nb; ++ib) {
+        const float d = GGML_CPU_FP16_TO_FP32(y[ib].d)*GGML_E8M0_TO_FP32_HALF(x[ib].e);
+        int sumi1 = 0;
+        int sumi2 = 0;
+        for (int j = 0; j < QK_MXFP4/2; ++j) {
+            sumi1 += y[ib].qs[j + 0] * kvalues_mxfp4[x[ib].qs[j] & 0xf];
+            sumi2 += y[ib].qs[j + QK_MXFP4/2] * kvalues_mxfp4[x[ib].qs[j] >> 4];
+        }
+        sumf += d * (sumi1 + sumi2);
+    }
+    *s = sumf;
+}
+
 void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;