@fugood/llama.node 1.4.14 → 1.5.0-rc.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/lib/binding.ts +13 -6
  2. package/lib/index.js +2 -2
  3. package/lib/index.ts +8 -3
  4. package/package.json +15 -15
  5. package/scripts/llama.cpp.patch +77 -65
  6. package/src/LlamaContext.cpp +31 -34
  7. package/src/llama.cpp/CMakeLists.txt +24 -8
  8. package/src/llama.cpp/common/CMakeLists.txt +15 -34
  9. package/src/llama.cpp/common/arg.cpp +59 -10
  10. package/src/llama.cpp/common/chat-parser.cpp +115 -0
  11. package/src/llama.cpp/common/chat.cpp +356 -34
  12. package/src/llama.cpp/common/chat.h +17 -13
  13. package/src/llama.cpp/common/common.cpp +0 -1
  14. package/src/llama.cpp/common/common.h +30 -25
  15. package/src/llama.cpp/common/debug.cpp +165 -0
  16. package/src/llama.cpp/common/debug.h +43 -0
  17. package/src/llama.cpp/common/download.cpp +12 -342
  18. package/src/llama.cpp/common/download.h +6 -0
  19. package/src/llama.cpp/common/jinja/caps.cpp +237 -0
  20. package/src/llama.cpp/common/jinja/caps.h +24 -0
  21. package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
  22. package/src/llama.cpp/common/jinja/lexer.h +157 -0
  23. package/src/llama.cpp/common/jinja/parser.cpp +591 -0
  24. package/src/llama.cpp/common/jinja/parser.h +21 -0
  25. package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
  26. package/src/llama.cpp/common/jinja/runtime.h +628 -0
  27. package/src/llama.cpp/common/jinja/string.cpp +207 -0
  28. package/src/llama.cpp/common/jinja/string.h +58 -0
  29. package/src/llama.cpp/common/jinja/utils.h +49 -0
  30. package/src/llama.cpp/common/jinja/value.cpp +1221 -0
  31. package/src/llama.cpp/common/jinja/value.h +464 -0
  32. package/src/llama.cpp/common/preset.cpp +12 -2
  33. package/src/llama.cpp/common/sampling.cpp +52 -19
  34. package/src/llama.cpp/ggml/include/ggml.h +39 -7
  35. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
  36. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
  37. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
  39. package/src/llama.cpp/include/llama-cpp.h +3 -1
  40. package/src/llama.cpp/include/llama.h +29 -2
  41. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  42. package/src/llama.cpp/src/llama-adapter.cpp +7 -13
  43. package/src/llama.cpp/src/llama-adapter.h +1 -3
  44. package/src/llama.cpp/src/llama-arch.cpp +35 -0
  45. package/src/llama.cpp/src/llama-arch.h +1 -0
  46. package/src/llama.cpp/src/llama-chat.cpp +20 -0
  47. package/src/llama.cpp/src/llama-chat.h +1 -0
  48. package/src/llama.cpp/src/llama-context.cpp +232 -144
  49. package/src/llama.cpp/src/llama-context.h +10 -0
  50. package/src/llama.cpp/src/llama-cparams.h +2 -0
  51. package/src/llama.cpp/src/llama-graph.cpp +31 -43
  52. package/src/llama.cpp/src/llama-hparams.cpp +0 -36
  53. package/src/llama.cpp/src/llama-hparams.h +38 -1
  54. package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
  55. package/src/llama.cpp/src/llama-kv-cache.h +0 -2
  56. package/src/llama.cpp/src/llama-mmap.cpp +13 -6
  57. package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
  58. package/src/llama.cpp/src/llama-model.cpp +215 -97
  59. package/src/llama.cpp/src/llama-model.h +3 -2
  60. package/src/llama.cpp/src/llama-sampling.cpp +170 -13
  61. package/src/llama.cpp/src/llama-vocab.cpp +37 -24
  62. package/src/llama.cpp/src/llama-vocab.h +1 -0
  63. package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
  64. package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
  65. package/src/llama.cpp/src/models/models.h +13 -2
  66. package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
@@ -96,11 +96,9 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
96
96
 
97
97
  int32_t * data = (int32_t *) pos_bucket->data;
98
98
 
99
- for (int h = 0; h < 1; ++h) {
100
- for (int j = 0; j < n_tokens; ++j) {
101
- for (int i = 0; i < n_tokens; ++i) {
102
- data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
103
- }
99
+ for (int j = 0; j < n_tokens; ++j) {
100
+ for (int i = 0; i < n_tokens; ++i) {
101
+ data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
104
102
  }
105
103
  }
106
104
  }
@@ -323,34 +321,32 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
323
321
  const int64_t n_tokens = ubatch->n_tokens;
324
322
 
325
323
  const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
326
- for (int h = 0; h < 1; ++h) {
327
- for (int i1 = 0; i1 < n_tokens; ++i1) {
328
- const llama_seq_id s1 = ubatch->seq_id[i1][0];
329
- const llama_pos p1 = ubatch->pos[i1];
324
+ for (int i1 = 0; i1 < n_tokens; ++i1) {
325
+ const llama_seq_id s1 = ubatch->seq_id[i1][0];
326
+ const llama_pos p1 = ubatch->pos[i1];
330
327
 
331
- const uint64_t idst = h*(n_kv*n_tokens) + i1*n_kv;
328
+ const uint64_t idst = i1*n_kv;
332
329
 
333
- for (int i0 = 0; i0 < n_tokens; ++i0) {
334
- const llama_seq_id s0 = ubatch->seq_id[i0][0];
335
- const llama_pos p0 = ubatch->pos[i0];
336
-
337
- // mask different sequences
338
- if (s0 != s1) {
339
- continue;
340
- }
330
+ for (int i0 = 0; i0 < n_tokens; ++i0) {
331
+ const llama_seq_id s0 = ubatch->seq_id[i0][0];
332
+ const llama_pos p0 = ubatch->pos[i0];
341
333
 
342
- // mask future tokens
343
- if (cparams.causal_attn && p0 > p1) {
344
- continue;
345
- }
334
+ // mask different sequences
335
+ if (s0 != s1) {
336
+ continue;
337
+ }
346
338
 
347
- // apply SWA if any
348
- if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
349
- continue;
350
- }
339
+ // mask future tokens
340
+ if (cparams.causal_attn && p0 > p1) {
341
+ continue;
342
+ }
351
343
 
352
- data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
344
+ // apply SWA if any
345
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
346
+ continue;
353
347
  }
348
+
349
+ data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
354
350
  }
355
351
  }
356
352
  };
@@ -454,27 +450,19 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
454
450
 
455
451
  float * data = (float *) cross_kq_mask->data;
456
452
 
457
- for (int h = 0; h < 1; ++h) {
458
- for (int i = 0; i < n_tokens; ++i) {
459
- for (int j = 0; j < n_enc; ++j) {
460
- float f = -INFINITY;
453
+ for (int i = 0; i < n_tokens; ++i) {
454
+ for (int j = 0; j < n_enc; ++j) {
455
+ float f = -INFINITY;
461
456
 
462
- for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
463
- const llama_seq_id seq_id = ubatch->seq_id[i][s];
457
+ for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
458
+ const llama_seq_id seq_id = ubatch->seq_id[i][s];
464
459
 
465
- if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
466
- f = 0.0f;
467
- }
460
+ if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
461
+ f = 0.0f;
468
462
  }
469
-
470
- data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
471
463
  }
472
- }
473
464
 
474
- for (int i = n_tokens; i < n_tokens; ++i) {
475
- for (int j = 0; j < n_enc; ++j) {
476
- data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
477
- }
465
+ data[i*n_enc + j] = f;
478
466
  }
479
467
  }
480
468
  }
@@ -200,42 +200,6 @@ uint32_t llama_hparams::n_layer_kv() const {
200
200
  return res;
201
201
  }
202
202
 
203
- bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
204
- assert(p0 >= 0 && p1 >= 0);
205
-
206
- switch (swa_type) {
207
- case LLAMA_SWA_TYPE_NONE:
208
- {
209
- } break;
210
- case LLAMA_SWA_TYPE_STANDARD:
211
- {
212
- if (p1 - p0 >= (int32_t) n_swa) {
213
- return true;
214
- }
215
- } break;
216
- case LLAMA_SWA_TYPE_CHUNKED:
217
- {
218
- const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
219
-
220
- if (p0 < pos_chunk_start) {
221
- return true;
222
- }
223
- } break;
224
- case LLAMA_SWA_TYPE_SYMMETRIC:
225
- {
226
- const int32_t half_n_swa = (int32_t) n_swa / 2;
227
- const int32_t pos_diff = p1 - p0;
228
-
229
- // Mask if outside the symmetric window
230
- if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
231
- return true;
232
- }
233
- } break;
234
- }
235
-
236
- return false;
237
- }
238
-
239
203
  bool llama_hparams::use_mrope() const {
240
204
  return rope_sections[0] > 0 && rope_sections[1] > 0;
241
205
  }
@@ -3,6 +3,7 @@
3
3
  #include "llama.h"
4
4
 
5
5
  #include <array>
6
+ #include <cassert>
6
7
 
7
8
  // bump if necessary
8
9
  #define LLAMA_MAX_LAYERS 512
@@ -274,9 +275,45 @@ struct llama_hparams {
274
275
  uint32_t n_layer_kv() const;
275
276
 
276
277
  // note that this function uses different SWA parameters from those in the hparams
278
+ // note: inlined on purpose for performance reasons
277
279
  // TODO: think of a better place for this function
278
280
  // TODO: pack the SWA params in a struct?
279
- static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1);
281
+ static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
282
+ assert(p0 >= 0 && p1 >= 0);
283
+
284
+ switch (swa_type) {
285
+ case LLAMA_SWA_TYPE_NONE:
286
+ {
287
+ } break;
288
+ case LLAMA_SWA_TYPE_STANDARD:
289
+ {
290
+ if (p1 - p0 >= (int32_t) n_swa) {
291
+ return true;
292
+ }
293
+ } break;
294
+ case LLAMA_SWA_TYPE_CHUNKED:
295
+ {
296
+ const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
297
+
298
+ if (p0 < pos_chunk_start) {
299
+ return true;
300
+ }
301
+ } break;
302
+ case LLAMA_SWA_TYPE_SYMMETRIC:
303
+ {
304
+ const int32_t half_n_swa = (int32_t) n_swa / 2;
305
+ const int32_t pos_diff = p1 - p0;
306
+
307
+ // Mask if outside the symmetric window
308
+ if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
309
+ return true;
310
+ }
311
+ } break;
312
+ }
313
+
314
+ return false;
315
+ }
316
+
280
317
 
281
318
  bool use_mrope() const;
282
319
  };
@@ -852,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
852
852
  const llama_seq_id seq_id_cell = cells.seq_get(idx);
853
853
 
854
854
  // SWA mask
855
- if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
855
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
856
856
  can_use = true;
857
857
  }
858
858
  }
@@ -1237,90 +1237,236 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
1237
1237
  }
1238
1238
  }
1239
1239
 
1240
- void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1241
- const uint32_t n_tokens = ubatch->n_tokens;
1240
+ struct args_set_input_kq_mask {
1241
+ const llama_hparams & hparams;
1242
+ const llama_ubatch * ubatch;
1242
1243
 
1243
- GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1244
- float * data = (float *) dst->data;
1244
+ const std::vector<llama_kv_cells> & v_cells;
1245
+ const std::vector<uint32_t> & seq_to_stream;
1245
1246
 
1246
- const int64_t n_kv = dst->ne[0];
1247
- const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
1247
+ uint32_t n_swa;
1248
+ llama_swa_type swa_type;
1248
1249
 
1249
- GGML_ASSERT(n_tokens%n_stream == 0);
1250
+ int64_t n_kv;
1251
+ int64_t n_stream;
1252
+ int64_t n_tps;
1253
+ };
1250
1254
 
1251
- // n_tps == n_tokens_per_stream
1252
- const int64_t n_tps = n_tokens/n_stream;
1255
+ template<bool causal, bool swa, bool is_2d, bool alibi>
1256
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1257
+ //const auto & hparams = args.hparams;
1258
+ const auto & ubatch = args.ubatch;
1253
1259
 
1254
- std::fill(data, data + ggml_nelements(dst), -INFINITY);
1255
-
1256
- // Use only the previous KV cells of the correct sequence for each token of the ubatch.
1257
- // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
1258
- // Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
1259
- // Causal mask:
1260
- // xxx-------
1261
- // xxxx------
1262
- // xxxxx-----
1263
- // Non-causal mask:
1264
- // xxxxx-----
1265
- // xxxxx-----
1266
- // xxxxx-----
1267
- // To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
1268
- // TODO: optimize this section
1269
- for (uint32_t h = 0; h < 1; ++h) {
1270
- for (uint32_t s = 0; s < n_stream; ++s) {
1271
- for (uint32_t ii = 0; ii < n_tps; ++ii) {
1272
- const uint32_t i = s*n_tps + ii;
1260
+ const auto & v_cells = args.v_cells;
1261
+ const auto & seq_to_stream = args.seq_to_stream;
1273
1262
 
1274
- const llama_seq_id seq_id = ubatch->seq_id[i][0];
1263
+ const uint32_t n_swa = args.n_swa;
1264
+ const llama_swa_type swa_type = args.swa_type;
1275
1265
 
1276
- const auto & cells = v_cells[seq_to_stream[seq_id]];
1266
+ const int64_t n_kv = args.n_kv;
1267
+ const int64_t n_stream = args.n_stream;
1268
+ const int64_t n_tps = args.n_tps;
1277
1269
 
1278
- const llama_pos p1 = ubatch->pos[i];
1270
+ // the min position in the batch for each sequence
1271
+ llama_pos seq_pos_min[LLAMA_MAX_SEQ];
1272
+ std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
1279
1273
 
1280
- // for M-RoPE
1281
- const bool is_2d = ubatch->is_pos_2d();
1282
- const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
1283
- const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
1274
+ for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
1275
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
1284
1276
 
1285
- const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
1277
+ seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
1278
+ }
1286
1279
 
1287
- for (uint32_t j = 0; j < n_kv; ++j) {
1288
- if (cells.is_empty(j)) {
1289
- continue;
1290
- }
1280
+ for (uint32_t s = 0; s < n_stream; ++s) {
1281
+ // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
1282
+ std::unordered_map<llama_seq_id, uint32_t> seq_srct;
1283
+ std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
1284
+
1285
+ for (uint32_t ii = 0; ii < n_tps; ++ii) {
1286
+ const uint32_t i = s*n_tps + ii;
1287
+
1288
+ const llama_seq_id seq_id = ubatch->seq_id[i][0];
1289
+
1290
+ const auto & cells = v_cells.at(seq_to_stream[seq_id]);
1291
+
1292
+ llama_pos p0 = -1;
1293
+ const llama_pos p1 = ubatch->pos[i];
1294
+
1295
+ // for M-RoPE
1296
+ const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
1297
+ const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
1298
+
1299
+ const uint64_t idst = n_kv*i;
1300
+
1301
+ // for tokens of the same sequence, the mask is mostly the same, so we can reuse it
1302
+ // the only cells that could change are the ones that are with similar positions as the
1303
+ // ones in the batch (i.e. due to causal masking, SWA, etc.)
1304
+ // keep track of those cells and shortcut the loop to save time
1305
+ // note: this optimization is not compatible with Alibi position encoding
1306
+ // ref: https://github.com/ggml-org/llama.cpp/pull/18842
1307
+ bool prev = false;
1291
1308
 
1292
- // mask the token if not the same sequence
1293
- if (!cells.seq_has(j, seq_id)) {
1294
- continue;
1309
+ auto & idxs = seq_idxs[seq_id];
1310
+
1311
+ if (!alibi) {
1312
+ if (seq_srct.find(seq_id) != seq_srct.end()) {
1313
+ const uint32_t srct = seq_srct[seq_id];
1314
+
1315
+ const uint64_t idst_prev = n_kv*srct;
1316
+
1317
+ std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
1318
+
1319
+ prev = true;
1320
+ } else {
1321
+ idxs.clear();
1322
+ idxs.reserve(ubatch->n_tokens + n_swa + 32);
1323
+
1324
+ seq_srct[seq_id] = i;
1325
+ }
1326
+ }
1327
+
1328
+ for (uint32_t jj = 0; jj < n_kv; ++jj) {
1329
+ uint32_t j = jj;
1330
+
1331
+ // we have an existing mask for this sequence -> update just seq_idxs
1332
+ if (!alibi) {
1333
+ if (prev) {
1334
+ if (jj >= idxs.size()) {
1335
+ break;
1336
+ }
1337
+
1338
+ j = idxs[jj];
1295
1339
  }
1340
+ }
1341
+
1342
+ if (cells.is_empty(j)) {
1343
+ goto skip;
1344
+ }
1345
+
1346
+ // mask the token if not the same sequence
1347
+ if (!cells.seq_has(j, seq_id)) {
1348
+ goto skip;
1349
+ }
1350
+
1351
+ p0 = cells.pos_get(j);
1296
1352
 
1297
- const llama_pos p0 = cells.pos_get(j);
1353
+ if (!alibi) {
1354
+ if (!prev) {
1355
+ // record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
1356
+ if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
1357
+ idxs.push_back(j);
1358
+ }
1359
+ }
1360
+ }
1298
1361
 
1362
+ if (causal) {
1299
1363
  // mask future tokens
1300
- if (causal_attn && p0 > p1) {
1301
- continue;
1364
+ if (p0 > p1) {
1365
+ goto skip;
1302
1366
  }
1303
1367
 
1304
1368
  // M-RoPE causal mask
1305
- if (causal_attn && is_2d && p0 == p1) {
1306
- const auto & p0_ext = cells.ext_get(j);
1307
- if (p0_ext.is_2d_gt(p1_x, p1_y)) {
1308
- continue;
1369
+ if (is_2d) {
1370
+ if (p0 == p1) {
1371
+ const auto & p0_ext = cells.ext_get(j);
1372
+
1373
+ if (p0_ext.is_2d_gt(p1_x, p1_y)) {
1374
+ goto skip;
1375
+ }
1309
1376
  }
1310
1377
  }
1378
+ }
1311
1379
 
1312
- // apply SWA if any
1313
- if (is_masked_swa(p0, p1)) {
1314
- continue;
1380
+ // apply SWA if any
1381
+ if (swa) {
1382
+ if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
1383
+ goto skip;
1315
1384
  }
1385
+ }
1316
1386
 
1317
- data[idst + j] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
1387
+ if (alibi) {
1388
+ data[idst + j] = -std::abs(p0 - p1);
1389
+ } else {
1390
+ data[idst + j] = 0.0f;
1318
1391
  }
1392
+
1393
+ continue;
1394
+ skip:
1395
+ data[idst + j] = -INFINITY;
1319
1396
  }
1320
1397
  }
1321
1398
  }
1322
1399
  }
1323
1400
 
1401
+ template<bool causal, bool swa, bool is_2d>
1402
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1403
+ const bool alibi = args.hparams.use_alibi;
1404
+ if (alibi) {
1405
+ set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
1406
+ } else {
1407
+ set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
1408
+ }
1409
+ }
1410
+
1411
+ template<bool causal, bool swa>
1412
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1413
+ const bool is_2d = args.ubatch->is_pos_2d();
1414
+ if (is_2d) {
1415
+ set_input_kq_mask_impl<causal, swa, true> (args, data);
1416
+ } else {
1417
+ set_input_kq_mask_impl<causal, swa, false>(args, data);
1418
+ }
1419
+ }
1420
+
1421
+ template<bool causal>
1422
+ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
1423
+ const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
1424
+ if (swa) {
1425
+ set_input_kq_mask_impl<causal, true> (args, data);
1426
+ } else {
1427
+ set_input_kq_mask_impl<causal, false>(args, data);
1428
+ }
1429
+ }
1430
+
1431
+ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
1432
+ const uint32_t n_tokens = ubatch->n_tokens;
1433
+
1434
+ GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
1435
+ float * data = (float *) dst->data;
1436
+
1437
+ const int64_t n_kv = dst->ne[0];
1438
+ const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
1439
+
1440
+ GGML_ASSERT(n_tokens%n_stream == 0);
1441
+
1442
+ // n_tps == n_tokens_per_stream
1443
+ const int64_t n_tps = n_tokens/n_stream;
1444
+
1445
+ //const int64_t t_start = ggml_time_us();
1446
+
1447
+ const args_set_input_kq_mask args = {
1448
+ /*.hparams =*/ hparams,
1449
+ /*.ubatch =*/ ubatch,
1450
+ /*.v_cells =*/ v_cells,
1451
+ /*.seq_to_stream =*/ seq_to_stream,
1452
+ /*.n_swa =*/ n_swa,
1453
+ /*.swa_type =*/ swa_type,
1454
+ /*.n_kv =*/ n_kv,
1455
+ /*.n_stream =*/ n_stream,
1456
+ /*.n_tps =*/ n_tps,
1457
+ };
1458
+
1459
+ if (causal_attn) {
1460
+ set_input_kq_mask_impl<true> (args, data);
1461
+ } else {
1462
+ set_input_kq_mask_impl<false>(args, data);
1463
+ }
1464
+
1465
+ //const int64_t t_end = ggml_time_us();
1466
+
1467
+ //LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
1468
+ }
1469
+
1324
1470
  void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
1325
1471
  const int64_t n_tokens = ubatch->n_tokens;
1326
1472
 
@@ -1483,10 +1629,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
1483
1629
  return gf;
1484
1630
  }
1485
1631
 
1486
- bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
1487
- return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
1488
- }
1489
-
1490
1632
  void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
1491
1633
  GGML_UNUSED(flags);
1492
1634
 
@@ -257,8 +257,6 @@ private:
257
257
  size_t size_k_bytes() const;
258
258
  size_t size_v_bytes() const;
259
259
 
260
- bool is_masked_swa(llama_pos p0, llama_pos p1) const;
261
-
262
260
  ggml_tensor * build_rope_shift(
263
261
  const llama_cparams & cparams,
264
262
  ggml_context * ctx,
@@ -244,11 +244,14 @@ struct llama_file::impl {
244
244
  }
245
245
  errno = 0;
246
246
  if (fd == -1) {
247
- std::size_t ret = std::fread(ptr, len, 1, fp);
247
+ const size_t curr_off = tell();
248
+ const size_t to_read = std::min(len, size - curr_off);
249
+
250
+ std::size_t ret = std::fread(ptr, to_read, 1, fp);
248
251
  if (ferror(fp)) {
249
252
  throw std::runtime_error(format("read error: %s", strerror(errno)));
250
253
  }
251
- if (ret != 1) {
254
+ if (to_read > 0 && ret != 1) {
252
255
  throw std::runtime_error("unexpectedly reached end of file");
253
256
  }
254
257
  } else {
@@ -262,7 +265,8 @@ struct llama_file::impl {
262
265
  continue; // Interrupted by signal, retry
263
266
  }
264
267
  // Fallback to std::fread in case the DMA controller cannot access the buffer
265
- if (errno == EFAULT) {
268
+ if (errno == EFAULT || errno == EINVAL) {
269
+ LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
266
270
  auto curr_off = tell();
267
271
  close(fd);
268
272
  fd = -1;
@@ -381,6 +385,9 @@ int llama_file::file_id() const {
381
385
  #ifdef _WIN32
382
386
  return _fileno(pimpl->fp);
383
387
  #else
388
+ if (pimpl->fd != -1) {
389
+ return pimpl->fd;
390
+ }
384
391
  #if defined(fileno)
385
392
  return fileno(pimpl->fp);
386
393
  #else
@@ -611,9 +618,9 @@ struct llama_mlock::impl {
611
618
 
612
619
  char* errmsg = std::strerror(errno);
613
620
  bool suggest = (errno == ENOMEM);
614
- #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
615
- // visionOS/tvOS dont't support RLIMIT_MEMLOCK
616
- // Skip resource limit checks on visionOS/tvOS
621
+ #if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
622
+ // visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
623
+ // Skip resource limit checks on these platforms
617
624
  suggest = false;
618
625
  #else
619
626
  struct rlimit lock_limit;
@@ -2,6 +2,7 @@
2
2
 
3
3
  #include "ggml.h"
4
4
 
5
+ #include <algorithm>
5
6
  #include <array>
6
7
  #include <cinttypes>
7
8
  #include <cstring>
@@ -344,6 +345,7 @@ namespace GGUFMeta {
344
345
  GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
345
346
 
346
347
  switch (arr_info.gt) {
348
+ case GGUF_TYPE_BOOL:
347
349
  case GGUF_TYPE_UINT32:
348
350
  case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
349
351
  (std::is_same<T, uint32_t>::value)); break;
@@ -365,7 +367,13 @@ namespace GGUFMeta {
365
367
  result[i] = value;
366
368
  }
367
369
  } else {
368
- std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
370
+ if (arr_info.gt == GGUF_TYPE_BOOL) {
371
+ std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
372
+ return static_cast<T>(x);
373
+ });
374
+ } else {
375
+ std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
376
+ }
369
377
  }
370
378
 
371
379
  return true;
@@ -531,12 +539,18 @@ llama_model_loader::llama_model_loader(
531
539
  files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
532
540
  contexts.emplace_back(ctx);
533
541
 
534
- use_direct_io = use_direct_io && files.back()->has_direct_io();
535
-
536
- // Disable mmap in case Direct I/O is enabled and available
537
- if (use_direct_io && use_mmap) {
538
- use_mmap = false;
539
- LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
542
+ if (use_mmap && use_direct_io) {
543
+ if (files.back()->has_direct_io()) {
544
+ // Disable mmap, as DirectIO is available
545
+ use_mmap = false;
546
+ LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
547
+ } else {
548
+ // Disable DirectIO and reopen file using std::fopen for mmap
549
+ use_direct_io = false;
550
+ files.pop_back();
551
+ files.emplace_back(new llama_file(fname.c_str(), "rb", false));
552
+ LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
553
+ }
540
554
  }
541
555
 
542
556
  // Save tensors data offset of the main file.