@fugood/llama.node 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
  "name": "@fugood/llama.node",
  "access": "public",
- "version": "1.2.5",
+ "version": "1.2.6",
  "description": "An another Node binding of llama.cpp",
  "main": "lib/index.js",
  "scripts": {
@@ -72,19 +72,19 @@
  "CMakeLists.txt"
  ],
  "optionalDependencies": {
- "@fugood/node-llama-linux-x64": "1.2.5",
- "@fugood/node-llama-linux-x64-vulkan": "1.2.5",
- "@fugood/node-llama-linux-x64-cuda": "1.2.5",
- "@fugood/node-llama-linux-arm64": "1.2.5",
- "@fugood/node-llama-linux-arm64-vulkan": "1.2.5",
- "@fugood/node-llama-linux-arm64-cuda": "1.2.5",
- "@fugood/node-llama-win32-x64": "1.2.5",
- "@fugood/node-llama-win32-x64-vulkan": "1.2.5",
- "@fugood/node-llama-win32-x64-cuda": "1.2.5",
- "@fugood/node-llama-win32-arm64": "1.2.5",
- "@fugood/node-llama-win32-arm64-vulkan": "1.2.5",
- "@fugood/node-llama-darwin-x64": "1.2.5",
- "@fugood/node-llama-darwin-arm64": "1.2.5"
+ "@fugood/node-llama-linux-x64": "1.2.6",
+ "@fugood/node-llama-linux-x64-vulkan": "1.2.6",
+ "@fugood/node-llama-linux-x64-cuda": "1.2.6",
+ "@fugood/node-llama-linux-arm64": "1.2.6",
+ "@fugood/node-llama-linux-arm64-vulkan": "1.2.6",
+ "@fugood/node-llama-linux-arm64-cuda": "1.2.6",
+ "@fugood/node-llama-win32-x64": "1.2.6",
+ "@fugood/node-llama-win32-x64-vulkan": "1.2.6",
+ "@fugood/node-llama-win32-x64-cuda": "1.2.6",
+ "@fugood/node-llama-win32-arm64": "1.2.6",
+ "@fugood/node-llama-win32-arm64-vulkan": "1.2.6",
+ "@fugood/node-llama-darwin-x64": "1.2.6",
+ "@fugood/node-llama-darwin-arm64": "1.2.6"
  },
  "devDependencies": {
  "@babel/preset-env": "^7.24.4",
@@ -68,7 +68,7 @@ struct ggml_compute_params {
  #endif // __VXE2__
  #endif // __s390x__ && __VEC__

- #if defined(__ARM_FEATURE_SVE)
+ #if defined(__ARM_FEATURE_SVE) && defined(__linux__)
  #include <sys/prctl.h>
  #endif

@@ -689,8 +689,13 @@ bool ggml_is_numa(void) {
  #endif

  static void ggml_init_arm_arch_features(void) {
- #if defined(__linux__) && defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+ #if defined(__linux__)
  ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ #else
+ // TODO: add support of SVE for non-linux systems
+ #error "TODO: SVE is not supported on this platform. To use SVE, sve_cnt needs to be initialized here."
+ #endif
  #endif
  }

@@ -463,9 +463,9 @@ ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const floa
  #endif
  for (; i < n; ++i) {
  float val = x[i] - mean;
+ y[i] = val;
  val *= val;
  sum += (ggml_float)val;
- y[i] = val;
  }
  return sum/n;
  }
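Note on the hunk above: the reorder changes what the output vector holds in the scalar tail. y[i] now receives the centered value x[i] - mean before it is squared, while the returned value is still the mean of the squared deviations. A minimal standalone sketch of that corrected ordering, for illustration only (centered_var is a stand-in name, not the real ggml_vec_cvar_f32):

#include <cstddef>

// Illustrative stand-in for the corrected scalar loop: y holds the centered
// values, the return value is the average squared deviation (variance).
static double centered_var(size_t n, float * y, const float * x, float mean) {
    double sum = 0.0;
    for (size_t i = 0; i < n; ++i) {
        float val = x[i] - mean;
        y[i] = val;          // store x[i] - mean (the fix moves this before squaring)
        val *= val;
        sum += (double) val; // accumulate the squared deviation
    }
    return n > 0 ? sum / n : 0.0;
}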
@@ -261,12 +261,17 @@ void llm_graph_input_cross_embd::set_input(const llama_ubatch * ubatch) {
  }
  }

- static void print_mask(float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
+ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64_t n_swa, llama_swa_type swa_type) {
  LLAMA_LOG_DEBUG("%s: === Attention mask ===\n", __func__);
- const char * swa_type_str = (swa_type == LLAMA_SWA_TYPE_NONE) ? "LLAMA_SWA_TYPE_NONE" :
- (swa_type == LLAMA_SWA_TYPE_STANDARD) ? "LLAMA_SWA_TYPE_STANDARD" :
- (swa_type == LLAMA_SWA_TYPE_CHUNKED) ? "LLAMA_SWA_TYPE_CHUNKED" :
- (swa_type == LLAMA_SWA_TYPE_SYMMETRIC) ? "LLAMA_SWA_TYPE_SYMMETRIC" : "unknown";
+ const char * swa_type_str = "unknown";
+
+ switch (swa_type) {
+ case LLAMA_SWA_TYPE_NONE: swa_type_str = "LLAMA_SWA_TYPE_NONE"; break;
+ case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
+ case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
+ case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+ };
+
  LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
  LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__);
  LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__);
@@ -295,50 +300,67 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
  const int64_t n_kv = ubatch->n_tokens;
  const int64_t n_tokens = ubatch->n_tokens;

- GGML_ASSERT(kq_mask);
- GGML_ASSERT(ggml_backend_buffer_is_host(kq_mask->buffer));
-
- float * data = (float *) kq_mask->data;
-
- // [TAG_NO_CACHE_ISWA]
- GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+ const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
+ for (int h = 0; h < 1; ++h) {
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const llama_pos p1 = ubatch->pos[i1];

- for (int h = 0; h < 1; ++h) {
- for (int i1 = 0; i1 < n_tokens; ++i1) {
- const llama_seq_id s1 = ubatch->seq_id[i1][0];
+ const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;

- for (int i0 = 0; i0 < n_tokens; ++i0) {
- float f = -INFINITY;
-
- for (int s = 0; s < ubatch->n_seq_id[i0]; ++s) {
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
  const llama_seq_id s0 = ubatch->seq_id[i0][0];
+ const llama_pos p0 = ubatch->pos[i0];

+ // mask different sequences
  if (s0 != s1) {
- continue; // skip different sequences
+ continue;
  }

- if (cparams.causal_attn && ubatch->pos[i0] > ubatch->pos[i1]) {
- continue; // skip future tokens for causal attention
+ // mask future tokens
+ if (cparams.causal_attn && p0 > p1) {
+ continue;
  }

- // TODO: this does not take into account that some layers are SWA and others are note (i.e. iSWA) [TAG_NO_CACHE_ISWA]
- //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
- // continue; // skip masked tokens for SWA
- //}
-
- // TODO: reimplement this like in llama_kv_cache_unified
- if (hparams.use_alibi) {
- f = -std::abs(ubatch->pos[i0] - ubatch->pos[i1]);
- } else {
- f = 0.0f;
+ // apply SWA if any
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+ continue;
  }
+
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
  }
- data[h*(n_kv*n_tokens) + i1*n_kv + i0] = f;
  }
  }
+ };
+
+ {
+ GGML_ASSERT(self_kq_mask);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask->buffer));
+
+ float * data = (float *) self_kq_mask->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask), -INFINITY);
+
+ fill_mask(data, 0, LLAMA_SWA_TYPE_NONE);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, 0, LLAMA_SWA_TYPE_NONE);
+ }
  }
- if (debug) {
- print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ GGML_ASSERT(self_kq_mask_swa);
+ GGML_ASSERT(ggml_backend_buffer_is_host(self_kq_mask_swa->buffer));
+
+ float * data = (float *) self_kq_mask_swa->data;
+
+ std::fill(data, data + ggml_nelements(self_kq_mask_swa), -INFINITY);
+
+ fill_mask(data, hparams.n_swa, hparams.swa_type);
+
+ if (debug) {
+ print_mask(data, n_tokens, n_kv, hparams.n_swa, hparams.swa_type);
+ }
  }
  }

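Note on the hunk above: the new fill_mask lambda reduces to one predicate per query/key pair. Keys from a different sequence, future keys (when causal attention is on), and keys outside the sliding window stay at -INFINITY; everything else receives either an ALiBi distance bias or 0. A hedged standalone sketch of that rule, using stand-in types instead of the real llama.cpp structures and assuming the standard SWA rule masks keys that lag the query by n_swa or more positions:

#include <cmath>
#include <cstdint>

enum swa_mode { SWA_NONE, SWA_STANDARD }; // stand-in for llama_swa_type

// Mask value for key (seq s0, pos p0) as seen from query (seq s1, pos p1).
static float mask_value(int32_t s0, int64_t p0, int32_t s1, int64_t p1,
                        bool causal_attn, bool use_alibi,
                        swa_mode swa, int64_t n_swa) {
    if (s0 != s1)               return -INFINITY; // different sequences never attend
    if (causal_attn && p0 > p1) return -INFINITY; // no attending to future tokens
    if (swa == SWA_STANDARD && p1 - p0 >= n_swa)
                                return -INFINITY; // outside the sliding window
    return use_alibi ? -std::fabs((float)(p0 - p1)) // ALiBi-style distance bias
                     : 0.0f;                        // fully visible
}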
@@ -1299,12 +1321,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
  k = ggml_permute(ctx0, k, 0, 2, 1, 3);
  v = ggml_permute(ctx0, v, 0, 2, 1, 3);

- const auto n_kv = k->ne[1];
-
  ggml_tensor * cur;

- // TODO: replace hardcoded padding with ggml-provided padding
- if (cparams.flash_attn && (n_kv % 256 == 0) && kq_b == nullptr) {
+ if (cparams.flash_attn && kq_b == nullptr) {
  GGML_ASSERT(kq_b == nullptr && "Flash attention does not support KQ bias yet");

  if (v_trans) {
@@ -1419,10 +1438,20 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
  auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);

  // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
- inp->kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
- ggml_set_input(inp->kq_mask);
+ inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask);
+
+ inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;

- inp->kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->kq_mask, GGML_TYPE_F16) : inp->kq_mask;
+ if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
+ inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
+ ggml_set_input(inp->self_kq_mask_swa);
+
+ inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
+ } else {
+ inp->self_kq_mask_swa = nullptr;
+ inp->self_kq_mask_swa_cnv = nullptr;
+ }

  return (llm_graph_input_attn_no_cache *) res->add_input(std::move(inp));
  }
@@ -1447,7 +1476,9 @@ ggml_tensor * llm_graph_context::build_attn(
  ggml_build_forward_expand(gf, k_cur);
  ggml_build_forward_expand(gf, v_cur);

- const auto & kq_mask = inp->get_kq_mask();
+ const bool is_swa = hparams.is_swa(il);
+
+ const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask();

  // [TAG_NO_CACHE_PAD]
  // TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
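Note on the hunk above: the cacheless attention input now carries both a full mask and a sliding-window mask, and each layer picks one depending on whether it is an SWA layer. A trivial hedged sketch of that selection idea, with illustrative stand-in names rather than the real llm_graph_input_attn_no_cache API:

struct attn_mask_inputs {
    const float * kq_mask     = nullptr; // full attention mask
    const float * kq_mask_swa = nullptr; // sliding-window mask (null when SWA is unused)
};

// SWA layers read the windowed mask; all other layers read the full mask.
static const float * select_kq_mask(const attn_mask_inputs & inp, bool layer_is_swa) {
    return (layer_is_swa && inp.kq_mask_swa) ? inp.kq_mask_swa : inp.kq_mask;
}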
@@ -257,10 +257,14 @@ public:

  void set_input(const llama_ubatch * ubatch) override;

- ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+ ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

- ggml_tensor * kq_mask = nullptr; // F32 [n_tokens, n_batch, 1, 1]
- ggml_tensor * kq_mask_cnv = nullptr; // [n_tokens, n_batch, 1, 1]
+ // n_tokens == n_batch
+ ggml_tensor * self_kq_mask = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_tokens, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_tokens, n_batch/n_stream, 1, n_stream]

  const llama_hparams hparams;
  const llama_cparams cparams;
@@ -11358,8 +11358,8 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
  }
  };

- struct llm_build_gemma_embedding_iswa : public llm_graph_context {
- llm_build_gemma_embedding_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+ struct llm_build_gemma_embedding : public llm_graph_context {
+ llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_k;

  ggml_tensor * cur;
@@ -11376,8 +11376,7 @@ struct llm_build_gemma_embedding_iswa : public llm_graph_context {
  // inp_pos - contains the positions
  ggml_tensor * inp_pos = build_inp_pos();

- // TODO: support cacheless iSWA embeddings [TAG_NO_CACHE_ISWA]
- auto * inp_attn = build_attn_inp_kv_iswa();
+ auto * inp_attn = build_attn_inp_no_cache();

  ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -19378,7 +19377,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
  case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_NEO_BERT:
  case LLM_ARCH_WAVTOKENIZER_DEC:
- //case LLM_ARCH_GEMMA_EMBEDDING: // TODO: disabled until the cacheless SWA logic is fixed [TAG_NO_CACHE_ISWA]
+ case LLM_ARCH_GEMMA_EMBEDDING:
  case LLM_ARCH_DREAM:
  case LLM_ARCH_LLADA:
  case LLM_ARCH_LLADA_MOE:
@@ -19671,7 +19670,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
  } break;
  case LLM_ARCH_GEMMA_EMBEDDING:
  {
- llm = std::make_unique<llm_build_gemma_embedding_iswa>(*this, params);
+ llm = std::make_unique<llm_build_gemma_embedding>(*this, params);
  } break;
  case LLM_ARCH_STARCODER2:
  {
@@ -312,6 +312,7 @@ struct llama_model * llama_model_load_from_splits(
  LLAMA_LOG_ERROR("%s: list of splits is empty\n", __func__);
  return nullptr;
  }
+ splits.reserve(n_paths);
  for (size_t i = 0; i < n_paths; ++i) {
  splits.push_back(paths[i]);
  }