@fugood/llama.node 1.1.11 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -8
- package/lib/binding.ts +18 -1
- package/lib/index.js +2 -2
- package/lib/index.ts +2 -2
- package/package.json +20 -16
- package/src/DecodeAudioTokenWorker.cpp +23 -26
- package/src/DecodeAudioTokenWorker.h +6 -8
- package/src/DetokenizeWorker.cpp +5 -8
- package/src/DetokenizeWorker.h +6 -5
- package/src/DisposeWorker.cpp +23 -3
- package/src/DisposeWorker.h +4 -2
- package/src/EmbeddingWorker.cpp +9 -35
- package/src/EmbeddingWorker.h +3 -2
- package/src/LlamaCompletionWorker.cpp +217 -315
- package/src/LlamaCompletionWorker.h +6 -12
- package/src/LlamaContext.cpp +166 -396
- package/src/LlamaContext.h +8 -13
- package/src/LoadSessionWorker.cpp +22 -19
- package/src/LoadSessionWorker.h +3 -2
- package/src/RerankWorker.h +3 -2
- package/src/SaveSessionWorker.cpp +22 -19
- package/src/SaveSessionWorker.h +3 -2
- package/src/TokenizeWorker.cpp +38 -35
- package/src/TokenizeWorker.h +12 -3
- package/src/common.hpp +0 -458
- package/src/llama.cpp/common/arg.cpp +50 -30
- package/src/llama.cpp/common/chat.cpp +250 -1
- package/src/llama.cpp/common/chat.h +4 -0
- package/src/llama.cpp/common/common.h +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +21 -1
- package/src/llama.cpp/common/log.cpp +53 -2
- package/src/llama.cpp/common/log.h +10 -4
- package/src/llama.cpp/common/sampling.cpp +23 -2
- package/src/llama.cpp/common/sampling.h +3 -1
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -2
- package/src/llama.cpp/ggml/include/ggml-backend.h +15 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-metal.h +0 -6
- package/src/llama.cpp/ggml/include/ggml.h +56 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +21 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +210 -96
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +57 -59
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +6 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +25 -38
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +4 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +379 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +41 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +150 -28
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +320 -73
- package/src/llama.cpp/include/llama.h +5 -6
- package/src/llama.cpp/src/llama-adapter.cpp +33 -0
- package/src/llama.cpp/src/llama-adapter.h +3 -0
- package/src/llama.cpp/src/llama-arch.cpp +28 -4
- package/src/llama.cpp/src/llama-arch.h +3 -0
- package/src/llama.cpp/src/llama-context.cpp +65 -57
- package/src/llama.cpp/src/llama-context.h +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +57 -11
- package/src/llama.cpp/src/llama-graph.h +8 -0
- package/src/llama.cpp/src/llama-hparams.cpp +37 -0
- package/src/llama.cpp/src/llama-hparams.h +10 -3
- package/src/llama.cpp/src/llama-kv-cache.cpp +56 -38
- package/src/llama.cpp/src/llama-kv-cache.h +9 -0
- package/src/llama.cpp/src/llama-model.cpp +217 -97
- package/src/llama.cpp/src/llama-model.h +0 -1
- package/src/llama.cpp/src/llama-quant.cpp +3 -3
- package/src/llama.cpp/src/llama-sampling.cpp +226 -126
- package/src/llama.cpp/src/llama.cpp +53 -10
- package/src/anyascii.c +0 -22223
- package/src/anyascii.h +0 -42
- package/src/tts_utils.cpp +0 -371
- package/src/tts_utils.h +0 -103
package/src/llama.cpp/src/llama-context.cpp

@@ -270,19 +270,7 @@ llama_context::llama_context(
         }
     }

-    // resolve automatic Flash Attention use and reserve worst-case graph
     if (!hparams.vocab_only) {
-        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
-        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
-
-        int n_splits_pp = -1;
-        int n_nodes_pp = -1;
-
-        int n_splits_tg = -1;
-        int n_nodes_tg = -1;
-
         llama_memory_context_ptr mctx;
         if (memory) {
             LLAMA_LOG_DEBUG("%s: reserving full memory module\n", __func__);
@@ -294,53 +282,68 @@ llama_context::llama_context(

         cross.v_embd.clear();

-
-
-
+        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+        // avoid reserving graphs with zero outputs - assume one output per sequence
+        n_outputs = n_seqs;
+
+        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
+
+        // resolve automatic Flash Attention use
+        if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
+            auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
             if (!gf) {
-                throw std::runtime_error("failed to
+                throw std::runtime_error("failed to split graph for Flash Attention check");
             }

-
-
-
-
-
-
-                ggml_tensor * n = ggml_graph_node(gf, i);
-                if (n->op != GGML_OP_FLASH_ATTN_EXT) {
-                    continue;
-                }
-                ggml_backend_dev_t device_fa = ggml_backend_get_device(
-                    ggml_backend_sched_get_tensor_backend(sched.get(), n));
-
-                // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
-                GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
-                const int il = std::stoi(n->name + prefix_len);
-                ggml_backend_dev_t device_kv = model.dev_layer(il);
-                if (device_fa != device_kv) {
-                    LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
-                                   "is assigned to device %s (usually due to missing support)\n",
-                                   __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
-                    // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
-                    fa_device_mismatch = true;
-                    break;
-                }
+            const size_t prefix_len = strlen(LLAMA_TENSOR_NAME_FATTN) + 1;
+            bool fa_device_mismatch = false;
+            for (int i = 0; i < ggml_graph_n_nodes(gf); i++) {
+                ggml_tensor * n = ggml_graph_node(gf, i);
+                if (n->op != GGML_OP_FLASH_ATTN_EXT) {
+                    continue;
                 }
-
-
-
-
-
-
-
-
-
-
-
-
-
+                ggml_backend_dev_t device_fa = ggml_backend_get_device(
+                    ggml_backend_sched_get_tensor_backend(sched.get(), n));
+
+                // TODO: instead of the tensor names, use a map to keep track of which (FA) tensors belong to which layer
+                GGML_ASSERT(strncmp(n->name, LLAMA_TENSOR_NAME_FATTN "-", prefix_len) == 0);
+                const int il = std::stoi(n->name + prefix_len);
+                ggml_backend_dev_t device_kv = model.dev_layer(il);
+                if (device_fa != device_kv) {
+                    LLAMA_LOG_WARN("%s: layer %d is assigned to device %s but the Flash Attention tensor "
+                                   "is assigned to device %s (usually due to missing support)\n",
+                                   __func__, il, ggml_backend_dev_name(device_kv), ggml_backend_dev_name(device_fa));
+                    // FIXME: fa_device_mismatch logic is wrong for --no-kv-offload, but this is broken anyways
+                    fa_device_mismatch = true;
+                    break;
+                }
+            }
+            if (fa_device_mismatch) {
+                cparams.flash_attn = false;
+                LLAMA_LOG_WARN("%s: Flash Attention was auto, set to disabled\n", __func__);
+                if (ggml_is_quantized(params.type_v)) {
+                    throw std::runtime_error("quantized V cache was requested, but this requires Flash Attention");
                 }
+            } else {
+                cparams.flash_attn = true;
+                LLAMA_LOG_INFO("%s: Flash Attention was auto, set to enabled\n", __func__);
+            }
+        }
+
+        // reserve worst-case graph
+        int n_splits_pp = -1;
+        int n_nodes_pp = -1;
+
+        int n_splits_tg = -1;
+        int n_nodes_tg = -1;
+
+        // reserve pp (prompt processing) graph first so that buffers are only allocated once
+        {
+            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
+            if (!gf) {
+                throw std::runtime_error("failed to allocate compute pp buffers");
            }

            n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
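The hunks above split the old combined step in two: the constructor now resolves LLAMA_FLASH_ATTN_TYPE_AUTO first (by splitting a small worst-case graph and checking that every GGML_OP_FLASH_ATTN_EXT node lands on the same device as its layer's KV data), and only then reserves the worst-case prompt-processing graph. A minimal sketch of how a caller opts into this automatic resolution, assuming the llama_context_params::flash_attn_type field of the llama.cpp revision bundled here; the model path and numeric values are placeholders:

// Sketch: requesting automatic Flash Attention resolution at context creation.
// Assumes the bundled llama.cpp revision, where llama_context_params carries
// flash_attn_type; "model.gguf" and the parameter values are placeholders.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 4096;
    cparams.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; // let the constructor decide

    // llama_init_from_model() runs the constructor shown above: it splits a worst-case
    // graph and enables Flash Attention only if every FA node sits on its layer's device
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx != nullptr) {
        // ... run inference ...
        llama_free(ctx);
    }

    llama_model_free(model);
    return 0;
}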
@@ -1366,8 +1369,9 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
     return static_cast<llm_graph_result *>(gf_res_reserve.get());
 }

-ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
+ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
+    GGML_ASSERT(n_outputs >= 1);

     if (n_tokens % n_seqs != 0) {
         n_tokens = ((n_tokens + (n_seqs - 1)) / n_seqs) * n_seqs; // round to next multiple of n_seqs
@@ -1401,7 +1405,9 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
     this->n_outputs = save_n_outputs;

     // initialize scheduler with the specified graph
-    if (!ggml_backend_sched_reserve(sched.get(), gf)) {
+    if (split_only) {
+        ggml_backend_sched_split_graph(sched.get(), gf);
+    } else if (!ggml_backend_sched_reserve(sched.get(), gf)) {
         LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
         return nullptr;
     }
@@ -1441,7 +1447,9 @@ ggml_status llama_context::graph_compute(
     if (backend_cpu != nullptr) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
         auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
-        set_threadpool_fn(backend_cpu, tp);
+        if (set_threadpool_fn) {
+            set_threadpool_fn(backend_cpu, tp);
+        }
     }

     // set the number of threads for all the backends
package/src/llama.cpp/src/llama-context.h

@@ -196,7 +196,7 @@ public:
     ggml_status graph_compute(ggml_cgraph * gf, bool batched);

     // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false);

 private:
     llm_graph_params graph_params(
package/src/llama.cpp/src/llama-graph.cpp

@@ -258,6 +258,36 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }

+static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+    LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
+    const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
+                                (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
+                                (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
+                                (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+    LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
+    LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
+    LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
+
+    LLAMA_LOG_DEBUG(" ");
+    for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+        LLAMA_LOG_DEBUG("%2d", j);
+    }
+    LLAMA_LOG_DEBUG("\n");
+
+    for (int i = 0; i < std::min((int64_t)20, n_tokens); ++i) {
+        LLAMA_LOG_DEBUG(" %2d ", i);
+        for (int j = 0; j < std::min((int64_t)20, n_kv); ++j) {
+            float val = data[i * n_kv + j];
+            if (val == -INFINITY) {
+                LLAMA_LOG_DEBUG(" ∞");
+            } else {
+                LLAMA_LOG_DEBUG(" 0");
+            }
+        }
+        LLAMA_LOG_DEBUG("\n");
+    }
+}
+
 void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
@@ -267,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {

     float * data = (float *) kq_mask->data;

+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -277,21 +310,33 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                 for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];

-
-
-                        if (hparams.use_alibi) {
-                            f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                        } else {
-                            f = 0.0f;
-                        }
-                        break;
+                    if (s0 != s1) {
+                        continue; // skip different sequences
                     }
-                }

+                    if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
+                        continue; // skip future tokens for causal attention
+                    }
+
+                    // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    //    continue; // skip masked tokens for SWA
+                    //}
+
+                    // TODO: reimplement this like in llama_kv_cache_unified
+                    if (hparams.use_alibi) {
+                        f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
+                    } else {
+                        f = 0.0f;
+                    }
+                }
                 data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
     }
+    if (debug) {
+        print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+    }
 }

 void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
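The rewritten mask loop above replaces the old single condition with early continues: a key/value token contributes -INFINITY unless it belongs to the same sequence as the query and (for causal attention) is not in its future, in which case the value is either an ALiBi distance bias or 0.0f. A standalone sketch of that per-pair rule; mask_value() is a hypothetical helper, not part of llama.cpp:

// Sketch of the mask rule applied per (query, key) pair in the hunk above.
// mask_value() is a hypothetical helper written only for this note.
#include <cmath>
#include <cstdio>

static float mask_value(int seq_q, int seq_k, int pos_q, int pos_k, bool causal, bool use_alibi) {
    if (seq_k != seq_q) {
        return -INFINITY;            // different sequences never attend to each other
    }
    if (causal && pos_k > pos_q) {
        return -INFINITY;            // causal attention masks future tokens
    }
    return use_alibi ? -std::abs(pos_k - pos_q) : 0.0f; // ALiBi distance bias or plain 0
}

int main() {
    std::printf("%f\n", mask_value(0, 0, 3, 1, true, false)); // 0: visible
    std::printf("%f\n", mask_value(0, 0, 1, 3, true, false)); // -inf: future token
    std::printf("%f\n", mask_value(0, 1, 3, 1, true, false)); // -inf: other sequence
    std::printf("%f\n", mask_value(0, 0, 3, 1, true, true));  // -2: ALiBi bias
    return 0;
}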
@@ -1228,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // split the batch into streams if needed
     const auto n_stream = k->ne[3];

-    q =
+    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

     q = ggml_permute(ctx0, q, 0, 2, 1, 3);
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
@@ -1386,7 +1431,8 @@ ggml_tensor * llm_graph_context::build_attn(

     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
-
+    // but it might not be worth it: https://github.com/ggml-org/llama.cpp/pull/15636
+    //assert(!ubatch.equal_seqs() || (k_cur->ne[3] == 1 && k_cur->ne[3] == ubatch.n_seqs_unq));

     ggml_tensor * q = q_cur;
     ggml_tensor * k = k_cur;
package/src/llama.cpp/src/llama-graph.h

@@ -78,6 +78,11 @@ struct llm_graph_params;

 class llm_graph_input_i {
 public:
+    llm_graph_input_i() {
+        const char * LLAMA_GRAPH_INPUT_DEBUG = getenv("LLAMA_GRAPH_INPUT_DEBUG");
+        debug = LLAMA_GRAPH_INPUT_DEBUG ? atoi(LLAMA_GRAPH_INPUT_DEBUG) : 0;
+    }
+
     virtual ~llm_graph_input_i() = default;

     virtual void set_input(const llama_ubatch * ubatch) = 0;
@@ -90,6 +95,9 @@ public:
         GGML_UNUSED(params);
         return false;
     }
+protected:
+    // env: LLAMA_GRAPH_INPUT_DEBUG
+    int debug = 0;
 };

 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
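Together, the two llama-graph.h hunks give every graph input a debug level that is read once from the LLAMA_GRAPH_INPUT_DEBUG environment variable and gates the new print_mask() dump shown earlier. A small sketch of the same parse, assuming a POSIX environment; running the host process with LLAMA_GRAPH_INPUT_DEBUG=1 set in the environment has the same effect:

// Sketch of the debug-flag pattern introduced above: a non-zero integer in
// LLAMA_GRAPH_INPUT_DEBUG turns on the attention-mask dump. Illustration only,
// not llama.cpp code; setenv() assumes a POSIX environment.
#include <cstdlib>
#include <cstdio>

int main() {
    // equivalent to launching the process with LLAMA_GRAPH_INPUT_DEBUG=1
    setenv("LLAMA_GRAPH_INPUT_DEBUG", "1", /*overwrite=*/1);

    const char * env = getenv("LLAMA_GRAPH_INPUT_DEBUG");
    const int debug  = env ? atoi(env) : 0; // same parse as the llm_graph_input_i() constructor

    std::printf("graph input debug level: %d\n", debug); // 0 = off, >0 = print masks
    return 0;
}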
package/src/llama.cpp/src/llama-hparams.cpp

@@ -1,6 +1,7 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+#include <cassert>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
     if (dense_first) {
@@ -178,3 +179,39 @@ uint32_t llama_hparams::n_layer_kv() const {

     return res;
 }
+
+bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+    assert(p0 >= 0 && p1 >= 0);
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:
+            {
+            } break;
+        case LLAMA_SWA_TYPE_STANDARD:
+            {
+                if (p1 - p0 >= (int32_t) n_swa) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_CHUNKED:
+            {
+                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
+
+                if (p0 < pos_chunk_start) {
+                    return true;
+                }
+            } break;
+        case LLAMA_SWA_TYPE_SYMMETRIC:
+            {
+                const int32_t half_n_swa = (int32_t) n_swa / 2;
+                const int32_t pos_diff = p1 - p0;
+
+                // Mask if outside the symmetric window
+                if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
+                    return true;
+                }
+            } break;
+    }
+
+    return false;
+}
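The new static llama_hparams::is_masked_swa() centralizes the sliding-window masking rules, including the new LLAMA_SWA_TYPE_SYMMETRIC case. A self-contained worked example of what each type masks for a window of n_swa = 4; the switch mirrors the hunk above so it can run without llama.cpp:

// Worked example for the is_masked_swa() rules added above, with n_swa = 4.
// Standalone sketch: enum and function names are local stand-ins, not llama.cpp symbols.
#include <cstdint>
#include <cstdio>

enum swa_type_t { SWA_NONE, SWA_STANDARD, SWA_CHUNKED, SWA_SYMMETRIC };

static bool is_masked(uint32_t n_swa, swa_type_t t, int32_t p0, int32_t p1) {
    switch (t) {
        case SWA_NONE:      return false;
        case SWA_STANDARD:  return p1 - p0 >= (int32_t) n_swa;                     // key too far in the past
        case SWA_CHUNKED:   return p0 < (p1 / (int32_t) n_swa) * (int32_t) n_swa;  // key outside p1's chunk
        case SWA_SYMMETRIC: {
            const int32_t half = (int32_t) n_swa / 2;
            const int32_t d    = p1 - p0;
            return d < -half || d > half;                                          // outside the +/- half window
        }
    }
    return false;
}

int main() {
    // query at position p1 = 9, keys at p0 = 4..9, window n_swa = 4
    for (int32_t p0 = 4; p0 <= 9; ++p0) {
        std::printf("p0=%d standard=%d chunked=%d symmetric=%d\n", p0,
                    is_masked(4, SWA_STANDARD,  p0, 9),
                    is_masked(4, SWA_CHUNKED,   p0, 9),
                    is_masked(4, SWA_SYMMETRIC, p0, 9));
    }
    return 0;
}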
package/src/llama.cpp/src/llama-hparams.h

@@ -16,9 +16,10 @@ enum llama_expert_gating_func_type {
 };

 enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE,
-    LLAMA_SWA_TYPE_STANDARD,
-    LLAMA_SWA_TYPE_CHUNKED,
+    LLAMA_SWA_TYPE_NONE = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED = 2,
+    LLAMA_SWA_TYPE_SYMMETRIC = 3,
 };

 struct llama_hparams_posnet {
@@ -158,6 +159,7 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -227,6 +229,11 @@ struct llama_hparams {

     // number of layers for which has_kv() returns true
     uint32_t n_layer_kv() const;
+
+    // note that this function uses different SWA parameters from those in the hparams
+    // TODO: think of a better place for this function
+    // TODO: pack the SWA params in a struct?
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
package/src/llama.cpp/src/llama-kv-cache.cpp

@@ -1018,16 +1018,33 @@ ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggm

     const int32_t ikv = map_layer_ids.at(il);

-
+    ggml_tensor * k = layers[ikv].k;
+
+    const int64_t n_embd_head = k_cur->ne[0];
+    const int64_t n_head = k_cur->ne[1];
+    const int64_t n_tokens = k_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;
+
+    // we can merge dims 0 and 1
+    // TODO: add ggml helper function for this?
+    GGML_ASSERT(ggml_row_size(k_cur->type, n_embd_head) == k_cur->nb[1]);

-
+    k_cur = ggml_view_2d(ctx, k_cur, n_embd_gqa, n_tokens, k_cur->nb[2], 0);

-
+    const int64_t n_stream = k->ne[2];
+
+    if (n_stream > 1) {
+        const int64_t kv_size = get_size();

-
-
+        assert(n_embd_gqa == k->ne[0]);
+        assert(kv_size == k->ne[1]);
+
+        // merge the buffer across all streams because the idxs are global
+        k = ggml_reshape_2d(ctx, k, n_embd_gqa, kv_size*n_stream);
     }

+    // store the current K values into the cache
     return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }

@@ -1038,28 +1055,51 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm

     auto * v = layers[ikv].v;

-    const int64_t
-    const int64_t
+    const int64_t n_embd_head = v_cur->ne[0];
+    const int64_t n_head = v_cur->ne[1];
+    const int64_t n_tokens = v_cur->ne[2];
+
+    const int64_t n_embd_gqa = n_embd_head*n_head;
+
+    // we can merge dims 0 and 1
+    GGML_ASSERT(ggml_row_size(v_cur->type, n_embd_head) == v_cur->nb[1]);

-
+    const int64_t n_stream = v->ne[2];

+    // take this branch when FA is enabled (the V cache is not transposed)
     if (!v_trans) {
-
-
+        v_cur = ggml_view_2d(ctx, v_cur, n_embd_gqa, n_tokens, v_cur->nb[2], 0);
+
+        if (n_stream > 1) {
+            const int64_t kv_size = get_size();
+
+            assert(n_embd_gqa == v->ne[0]);
+            assert(kv_size == v->ne[1]);
+
+            // merge the buffer across all streams because the idxs are global
+            v = ggml_reshape_2d(ctx, v, n_embd_gqa, kv_size*n_stream);
         }

         return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }

+    if (ggml_row_size(v_cur->type, n_embd_gqa) == v_cur->nb[2]) {
+        // we can merge dims 0, 1 and 2
+        v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens);
+    } else {
+        // otherwise -> make a copy to get contiguous data
+        v_cur = ggml_cont_2d (ctx, v_cur, n_embd_gqa, n_tokens);
+    }
+
     // [TAG_V_CACHE_VARIABLE]
-    if (
-    v_cur = ggml_pad(ctx, v_cur, v->ne[0] -
+    if (n_embd_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_gqa, 0, 0, 0);
     }

-    // the row
-    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v
+    // in this branch the v_idxs are constructed in such a way that each row is a single head element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, ggml_nelements(v));

-    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, ggml_nelements(v_cur));

     return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
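Both cpy_k() and cpy_v() above now flatten the per-head [n_embd_head, n_head, n_tokens] tensor into a 2-D [n_embd_gqa, n_tokens] view before ggml_set_rows(), which is only valid when the heads are laid out contiguously, i.e. ggml_row_size(type, n_embd_head) == nb[1]. A small sketch of that precondition against the ggml headers shipped in this package; the tensor sizes are placeholders:

// Sketch of the dim-merge precondition used by cpy_k()/cpy_v() above: dims 0 and 1
// of a [n_embd_head, n_head, n_tokens] tensor can be merged into a 2-D view only
// when the heads are contiguous (row size of dim 0 equals nb[1]).
// Assumes the ggml headers under package/src/llama.cpp/ggml.
#include "ggml.h"
#include <cstdio>

int main() {
    ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ nullptr, /*.no_alloc =*/ false };
    ggml_context * ctx = ggml_init(ip);

    const int64_t n_embd_head = 64, n_head = 8, n_tokens = 4; // placeholder sizes
    ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, n_tokens);

    // freshly created tensors are contiguous, so the merge condition holds
    const bool can_merge = ggml_row_size(cur->type, n_embd_head) == cur->nb[1];
    std::printf("can merge dims 0 and 1: %d\n", can_merge);

    if (can_merge) {
        // same transformation as the hunk: view all heads of each token as one row
        ggml_tensor * cur2d = ggml_view_2d(ctx, cur, n_embd_head*n_head, n_tokens, cur->nb[2], 0);
        std::printf("2-D view: %lld x %lld\n", (long long) cur2d->ne[0], (long long) cur2d->ne[1]);
    }

    ggml_free(ctx);
    return 0;
}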
@@ -1393,29 +1433,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
 }

 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
-
-
-    switch (swa_type) {
-        case LLAMA_SWA_TYPE_NONE:
-            {
-            } break;
-        case LLAMA_SWA_TYPE_STANDARD:
-            {
-                if (p1 - p0 >= (int32_t) n_swa) {
-                    return true;
-                }
-            } break;
-        case LLAMA_SWA_TYPE_CHUNKED:
-            {
-                const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
-
-                if (p0 < pos_chunk_start) {
-                    return true;
-                }
-            } break;
-    }
-
-    return false;
+    return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
 }

 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
package/src/llama.cpp/src/llama-kv-cache.h

@@ -212,6 +212,7 @@ private:
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;

+    // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

     std::vector<ggml_context_ptr> ctxs;
@@ -316,9 +317,17 @@ public:
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;

     // store k_cur and v_cur in the cache based on the provided head location
+    // note: the heads in k_cur and v_cur should be layed out contiguously in memory
+    // - k_cur [n_embd_head_k, n_head_k, n_tokens]
+    // - k_idxs [n_tokens]
+    // - v_cur [n_embd_head_v, n_head_v, n_tokens]
+    // - v_idxs [n_tokens] or [n_tokens*n_embd_v_gqa] depending if V cache is transposed
     ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const;
     ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il) const;

+    // create destination indices for each head of the current batch for where it would be written in the KV cache
+    // the indices address the global KV cache (not per stream) - this is not relevant for the user of this API, but
+    // helps understand the implementation logic of cpy_k and cpy_v
     ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
     ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
