@fugood/llama.node 1.2.5 → 1.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +6 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +1 -1
- package/src/llama.cpp/src/llama-graph.cpp +74 -43
- package/src/llama.cpp/src/llama-graph.h +7 -3
- package/src/llama.cpp/src/llama-model.cpp +5 -6
- package/src/llama.cpp/src/llama.cpp +1 -0
package/package.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "@fugood/llama.node",
   "access": "public",
-  "version": "1.2.5",
+  "version": "1.2.6",
   "description": "An another Node binding of llama.cpp",
   "main": "lib/index.js",
   "scripts": {
@@ -72,19 +72,19 @@
     "CMakeLists.txt"
   ],
   "optionalDependencies": {
-    "@fugood/node-llama-linux-x64": "1.2.5",
-    "@fugood/node-llama-linux-x64-vulkan": "1.2.5",
-    "@fugood/node-llama-linux-x64-cuda": "1.2.5",
-    "@fugood/node-llama-linux-arm64": "1.2.5",
-    "@fugood/node-llama-linux-arm64-vulkan": "1.2.5",
-    "@fugood/node-llama-linux-arm64-cuda": "1.2.5",
-    "@fugood/node-llama-win32-x64": "1.2.5",
-    "@fugood/node-llama-win32-x64-vulkan": "1.2.5",
-    "@fugood/node-llama-win32-x64-cuda": "1.2.5",
-    "@fugood/node-llama-win32-arm64": "1.2.5",
-    "@fugood/node-llama-win32-arm64-vulkan": "1.2.5",
-    "@fugood/node-llama-darwin-x64": "1.2.5",
-    "@fugood/node-llama-darwin-arm64": "1.2.5"
+    "@fugood/node-llama-linux-x64": "1.2.6",
+    "@fugood/node-llama-linux-x64-vulkan": "1.2.6",
+    "@fugood/node-llama-linux-x64-cuda": "1.2.6",
+    "@fugood/node-llama-linux-arm64": "1.2.6",
+    "@fugood/node-llama-linux-arm64-vulkan": "1.2.6",
+    "@fugood/node-llama-linux-arm64-cuda": "1.2.6",
+    "@fugood/node-llama-win32-x64": "1.2.6",
+    "@fugood/node-llama-win32-x64-vulkan": "1.2.6",
+    "@fugood/node-llama-win32-x64-cuda": "1.2.6",
+    "@fugood/node-llama-win32-arm64": "1.2.6",
+    "@fugood/node-llama-win32-arm64-vulkan": "1.2.6",
+    "@fugood/node-llama-darwin-x64": "1.2.6",
+    "@fugood/node-llama-darwin-arm64": "1.2.6"
   },
   "devDependencies": {
     "@babel/preset-env": "^7.24.4",
package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c
CHANGED
@@ -689,8 +689,13 @@ bool ggml_is_numa(void) {
 #endif
 
 static void ggml_init_arm_arch_features(void) {
-#if defined(
+#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+#if defined(__linux__)
     ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#else
+    // TODO: add support of SVE for non-linux systems
+#error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+#endif
 #endif
 }
 
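The SVE path above only initializes sve_cnt on Linux, where the kernel exposes the current vector length through prctl; other platforms now hit the #error. A minimal standalone sketch of that query (an illustration, not part of the package; the guards and constants come from <sys/prctl.h> on aarch64 Linux):

    #include <stdio.h>
    #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
    #include <sys/prctl.h>
    #endif

    int main(void) {
    #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
        // PR_SVE_GET_VL returns the current SVE vector length (in bytes) in its low bits;
        // PR_SVE_VL_LEN_MASK strips the flag bits, mirroring the ggml code above.
        const int ret = prctl(PR_SVE_GET_VL);
        if (ret >= 0) {
            printf("SVE vector length: %d bytes\n", ret & PR_SVE_VL_LEN_MASK);
        }
    #else
        printf("SVE query not available for this target\n");
    #endif
        return 0;
    }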
package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp
CHANGED
@@ -463,9 +463,9 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
 #endif
     for (; i < n; ++i) {
         float val = x[i] - mean;
+        y[i] = val;
         val *= val;
         sum += (ggml_float)val;
-        y[i] = val;
     }
     return sum/n;
 }
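The reorder matters because val is squared in place: storing y[i] before the squaring keeps the centered value x[i] - mean in y rather than its square, while the function still returns the mean of the squared deviations. A scalar reference of that intended behavior (a sketch under that assumption, with double standing in for ggml_float):

    // Reference tail loop: y receives x[i] - mean, the return value is the variance.
    static double vec_cvar_ref(const int n, float * y, const float * x, const float mean) {
        double sum = 0.0;
        for (int i = 0; i < n; ++i) {
            float val = x[i] - mean;
            y[i] = val;              // store the centered value (pre-fix code stored val*val here)
            val *= val;
            sum += (double) val;
        }
        return sum / n;
    }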
package/src/llama.cpp/src/llama-graph.cpp
CHANGED
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
     }
 }
 
-static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
     LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
-    const char * swa_type_str =
-
-
-
+    const char * swa_type_str = "unknown";
+
+    switch (swa_type) {
+        case LLAMA_SWA_TYPE_NONE:      swa_type_str = "LLAMA_SWA_TYPE_NONE";      break;
+        case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD";  break;
+        case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED";   break;
+        case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+    };
+
     LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
     LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
     LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     const int64_t n_kv     = ubatch->n_tokens;
     const int64_t n_tokens = ubatch->n_tokens;
 
-
-
-
-
-
-    // [TAG_NO_CACHE_ISWA]
-    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+    const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+        for (int h = 0; h < 1; ++h) {
+            for (int i1 = 0; i1 < n_tokens; ++i1) {
+                const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const llama_pos    p1 = ubatch->pos[i1];
 
-
-    for (int i1 = 0; i1 < n_tokens; ++i1) {
-        const llama_seq_id s1 = ubatch->seq_id[i1][0];
+                const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
 
-
-            float f = -INFINITY;
-
-            for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+                for (int i0 = 0; i0 < n_tokens; ++i0) {
                     const llama_seq_id s0 = ubatch->seq_id[i0][0];
+                    const llama_pos    p0 = ubatch->pos[i0];
 
+                    // mask different sequences
                     if (s0 != s1) {
-                    continue;
+                        continue;
                     }
 
-
-
+                    // mask future tokens
+                    if (cparams.causal_attn && p0 > p1) {
+                        continue;
                     }
 
-                //
-
-
-                //}
-
-                // TODO: reimplement this like in llama_kv_cache_unified
-                if (hparams.use_alibi) {
-                    f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
-                } else {
-                    f = 0.0f;
+                    // apply SWA if any
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                        continue;
                     }
+
+                    data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
                 }
-                data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
             }
         }
+    };
+
+    {
+        GGML_ASSERT(self_kq_mask);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+        float * data = (float *) self_kq_mask->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+        fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+        }
     }
-
-
+
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        GGML_ASSERT(self_kq_mask_swa);
+        GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+        float * data = (float *) self_kq_mask_swa->data;
+
+        std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+        fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+        if (debug) {
+            print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+        }
     }
 }
 
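The new fill_mask lambda skips a key/value position when it belongs to a different sequence, lies in the future under causal attention, or falls outside the sliding window; everything else gets 0 (or the ALiBi distance) on top of the -INFINITY fill. As a rough illustration of the last check only (a hypothetical stand-in for llama_hparams::is_masked_swa, assuming standard sliding-window semantics):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical sketch of a standard sliding-window check: a key at position p0
    // is masked for a query at position p1 once it is n_swa or more tokens behind.
    static bool is_masked_swa_sketch(int32_t n_swa, int32_t p0, int32_t p1) {
        if (n_swa <= 0) {
            return false; // no window configured -> nothing extra is masked
        }
        return p1 - p0 >= n_swa;
    }

    int main() {
        // Query at position 10, window of 4: keys 7..10 may attend, older keys are masked.
        for (int32_t p0 = 5; p0 <= 10; ++p0) {
            std::printf("p0=%d masked=%d\n", p0, is_masked_swa_sketch(4, p0, 10) ? 1 : 0);
        }
        return 0;
    }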
@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
     v = ggml_permute(ctx0, v, 0, 2, 1, 3);
 
-    const auto n_kv = k->ne[1];
-
     ggml_tensor * cur;
 
-
-    if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+    if (cparams.flash_attn && kq_b == nullptr) {
         GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");
 
         if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
 
     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
-    inp->
-    ggml_set_input(inp->
+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+    ggml_set_input(inp->self_kq_mask);
+
+    inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
 
-
+    if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+        ggml_set_input(inp->self_kq_mask_swa);
+
+        inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+    } else {
+        inp->self_kq_mask_swa     = nullptr;
+        inp->self_kq_mask_swa_cnv = nullptr;
+    }
 
     return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
 }
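The mask is allocated as F32, padded along the KV dimension, and only cast to F16 when flash attention is enabled. A condensed sketch of that pattern (the make_kq_mask helper name and its parameters are illustrative, not part of the package):

    #include "ggml.h"

    // Illustrative helper: allocate a padded F32 attention mask and, when flash
    // attention is on, add a cast-to-F16 node so the kernel gets the type it expects.
    static ggml_tensor * make_kq_mask(ggml_context * ctx, int64_t n_tokens, int64_t pad, bool flash_attn) {
        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,
                                                n_tokens, GGML_PAD(n_tokens, pad), 1, 1);
        ggml_set_input(mask); // values are written on the host in set_input()

        return flash_attn ? ggml_cast(ctx, mask, GGML_TYPE_F16) : mask;
    }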
@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
     ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
 
-    const
+    const bool is_swa = hparams.is_swa(il);
+
+    const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();
 
     // [TAG_NO_CACHE_PAD]
     // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
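Each layer then picks between the dense mask and the sliding-window mask based on the model hyperparameters. A minimal sketch of that selection, with a hypothetical struct standing in for the graph input object:

    #include "ggml.h"

    // Hypothetical container for the two converted masks built earlier.
    struct attn_masks {
        ggml_tensor * kq_mask_cnv     = nullptr; // dense (non-SWA) mask
        ggml_tensor * kq_mask_swa_cnv = nullptr; // sliding-window mask
    };

    // Layers flagged as SWA use the windowed mask; all others use the dense one.
    static ggml_tensor * select_kq_mask(const attn_masks & inp, bool layer_is_swa) {
        return layer_is_swa ? inp.kq_mask_swa_cnv : inp.kq_mask_cnv;
    }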
package/src/llama.cpp/src/llama-graph.h
CHANGED
@@ -257,10 +257,14 @@ public:
 
     void set_input(const llama_ubatch * ubatch) override;
 
-    ggml_tensor * get_kq_mask()
+    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
+    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
 
-
-    ggml_tensor *
+    // n_tokens == n_batch
+    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_tokens, n_batch/n_stream, 1, n_stream]
 
     const llama_hparams hparams;
     const llama_cparams cparams;
package/src/llama.cpp/src/llama-model.cpp
CHANGED
@@ -11358,8 +11358,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
     }
 };
 
-struct
-
+struct llm_build_gemma_embedding : public llm_graph_context {
+    llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_k;
 
         ggml_tensor * cur;

@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();
 
-
-        auto * inp_attn = build_attn_inp_kv_iswa();
+        auto * inp_attn = build_attn_inp_no_cache();
 
         ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_NOMIC_BERT_MOE:
         case LLM_ARCH_NEO_BERT:
         case LLM_ARCH_WAVTOKENIZER_DEC:
-
+        case LLM_ARCH_GEMMA_EMBEDDING:
         case LLM_ARCH_DREAM:
         case LLM_ARCH_LLADA:
         case LLM_ARCH_LLADA_MOE:

@@ -19671,7 +19670,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
            } break;
        case LLM_ARCH_GEMMA_EMBEDDING:
            {
-                llm = std::make_unique<
+                llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
            } break;
        case LLM_ARCH_STARCODER2:
            {