@fugood/llama.node 1.4.14 → 1.5.0-rc.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +13 -6
- package/lib/index.js +2 -2
- package/lib/index.ts +8 -3
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +77 -65
- package/src/LlamaContext.cpp +31 -34
- package/src/llama.cpp/CMakeLists.txt +24 -8
- package/src/llama.cpp/common/CMakeLists.txt +15 -34
- package/src/llama.cpp/common/arg.cpp +59 -10
- package/src/llama.cpp/common/chat-parser.cpp +115 -0
- package/src/llama.cpp/common/chat.cpp +356 -34
- package/src/llama.cpp/common/chat.h +17 -13
- package/src/llama.cpp/common/common.cpp +0 -1
- package/src/llama.cpp/common/common.h +30 -25
- package/src/llama.cpp/common/debug.cpp +165 -0
- package/src/llama.cpp/common/debug.h +43 -0
- package/src/llama.cpp/common/download.cpp +12 -342
- package/src/llama.cpp/common/download.h +6 -0
- package/src/llama.cpp/common/jinja/caps.cpp +237 -0
- package/src/llama.cpp/common/jinja/caps.h +24 -0
- package/src/llama.cpp/common/jinja/lexer.cpp +341 -0
- package/src/llama.cpp/common/jinja/lexer.h +157 -0
- package/src/llama.cpp/common/jinja/parser.cpp +591 -0
- package/src/llama.cpp/common/jinja/parser.h +21 -0
- package/src/llama.cpp/common/jinja/runtime.cpp +865 -0
- package/src/llama.cpp/common/jinja/runtime.h +628 -0
- package/src/llama.cpp/common/jinja/string.cpp +207 -0
- package/src/llama.cpp/common/jinja/string.h +58 -0
- package/src/llama.cpp/common/jinja/utils.h +49 -0
- package/src/llama.cpp/common/jinja/value.cpp +1221 -0
- package/src/llama.cpp/common/jinja/value.h +464 -0
- package/src/llama.cpp/common/preset.cpp +12 -2
- package/src/llama.cpp/common/sampling.cpp +52 -19
- package/src/llama.cpp/ggml/include/ggml.h +39 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +63 -37
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +31 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +18 -0
- package/src/llama.cpp/include/llama-cpp.h +3 -1
- package/src/llama.cpp/include/llama.h +29 -2
- package/src/llama.cpp/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/src/llama-adapter.cpp +7 -13
- package/src/llama.cpp/src/llama-adapter.h +1 -3
- package/src/llama.cpp/src/llama-arch.cpp +35 -0
- package/src/llama.cpp/src/llama-arch.h +1 -0
- package/src/llama.cpp/src/llama-chat.cpp +20 -0
- package/src/llama.cpp/src/llama-chat.h +1 -0
- package/src/llama.cpp/src/llama-context.cpp +232 -144
- package/src/llama.cpp/src/llama-context.h +10 -0
- package/src/llama.cpp/src/llama-cparams.h +2 -0
- package/src/llama.cpp/src/llama-graph.cpp +31 -43
- package/src/llama.cpp/src/llama-hparams.cpp +0 -36
- package/src/llama.cpp/src/llama-hparams.h +38 -1
- package/src/llama.cpp/src/llama-kv-cache.cpp +201 -59
- package/src/llama.cpp/src/llama-kv-cache.h +0 -2
- package/src/llama.cpp/src/llama-mmap.cpp +13 -6
- package/src/llama.cpp/src/llama-model-loader.cpp +21 -7
- package/src/llama.cpp/src/llama-model.cpp +215 -97
- package/src/llama.cpp/src/llama-model.h +3 -2
- package/src/llama.cpp/src/llama-sampling.cpp +170 -13
- package/src/llama.cpp/src/llama-vocab.cpp +37 -24
- package/src/llama.cpp/src/llama-vocab.h +1 -0
- package/src/llama.cpp/src/models/exaone-moe.cpp +146 -0
- package/src/llama.cpp/src/models/gemma3n-iswa.cpp +13 -3
- package/src/llama.cpp/src/models/models.h +13 -2
- package/src/llama.cpp/src/models/qwen3next.cpp +198 -182
|
@@ -96,11 +96,9 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
|
|
|
96
96
|
|
|
97
97
|
int32_t * data = (int32_t *) pos_bucket->data;
|
|
98
98
|
|
|
99
|
-
for (int
|
|
100
|
-
for (int
|
|
101
|
-
|
|
102
|
-
data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
|
|
103
|
-
}
|
|
99
|
+
for (int j = 0; j < n_tokens; ++j) {
|
|
100
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
101
|
+
data[j*n_tokens + i] = llama_relative_position_bucket(ubatch->pos[i], ubatch->pos[j], hparams.n_rel_attn_bkts, true);
|
|
104
102
|
}
|
|
105
103
|
}
|
|
106
104
|
}
|
|
@@ -323,34 +321,32 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
|
|
|
323
321
|
const int64_t n_tokens = ubatch->n_tokens;
|
|
324
322
|
|
|
325
323
|
const auto fill_mask = [&](float * data, int n_swa, llama_swa_type swa_type) {
|
|
326
|
-
for (int
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
const llama_pos p1 = ubatch->pos[i1];
|
|
324
|
+
for (int i1 = 0; i1 < n_tokens; ++i1) {
|
|
325
|
+
const llama_seq_id s1 = ubatch->seq_id[i1][0];
|
|
326
|
+
const llama_pos p1 = ubatch->pos[i1];
|
|
330
327
|
|
|
331
|
-
|
|
328
|
+
const uint64_t idst = i1*n_kv;
|
|
332
329
|
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
// mask different sequences
|
|
338
|
-
if (s0 != s1) {
|
|
339
|
-
continue;
|
|
340
|
-
}
|
|
330
|
+
for (int i0 = 0; i0 < n_tokens; ++i0) {
|
|
331
|
+
const llama_seq_id s0 = ubatch->seq_id[i0][0];
|
|
332
|
+
const llama_pos p0 = ubatch->pos[i0];
|
|
341
333
|
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
334
|
+
// mask different sequences
|
|
335
|
+
if (s0 != s1) {
|
|
336
|
+
continue;
|
|
337
|
+
}
|
|
346
338
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
339
|
+
// mask future tokens
|
|
340
|
+
if (cparams.causal_attn && p0 > p1) {
|
|
341
|
+
continue;
|
|
342
|
+
}
|
|
351
343
|
|
|
352
|
-
|
|
344
|
+
// apply SWA if any
|
|
345
|
+
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
|
|
346
|
+
continue;
|
|
353
347
|
}
|
|
348
|
+
|
|
349
|
+
data[idst + i0] = hparams.use_alibi ? -std::abs(p0 - p1) : 0.0f;
|
|
354
350
|
}
|
|
355
351
|
}
|
|
356
352
|
};
|
|
@@ -454,27 +450,19 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
|
|
|
454
450
|
|
|
455
451
|
float * data = (float *) cross_kq_mask->data;
|
|
456
452
|
|
|
457
|
-
for (int
|
|
458
|
-
for (int
|
|
459
|
-
|
|
460
|
-
float f = -INFINITY;
|
|
453
|
+
for (int i = 0; i < n_tokens; ++i) {
|
|
454
|
+
for (int j = 0; j < n_enc; ++j) {
|
|
455
|
+
float f = -INFINITY;
|
|
461
456
|
|
|
462
|
-
|
|
463
|
-
|
|
457
|
+
for (int s = 0; s < ubatch->n_seq_id[i]; ++s) {
|
|
458
|
+
const llama_seq_id seq_id = ubatch->seq_id[i][s];
|
|
464
459
|
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
}
|
|
460
|
+
if (cross->seq_ids_enc[j].find(seq_id) != cross->seq_ids_enc[j].end()) {
|
|
461
|
+
f = 0.0f;
|
|
468
462
|
}
|
|
469
|
-
|
|
470
|
-
data[h*(n_enc*n_tokens) + i*n_enc + j] = f;
|
|
471
463
|
}
|
|
472
|
-
}
|
|
473
464
|
|
|
474
|
-
|
|
475
|
-
for (int j = 0; j < n_enc; ++j) {
|
|
476
|
-
data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
|
|
477
|
-
}
|
|
465
|
+
data[i*n_enc + j] = f;
|
|
478
466
|
}
|
|
479
467
|
}
|
|
480
468
|
}
|
|
@@ -200,42 +200,6 @@ uint32_t llama_hparams::n_layer_kv() const {
|
|
|
200
200
|
return res;
|
|
201
201
|
}
|
|
202
202
|
|
|
203
|
-
bool llama_hparams::is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
204
|
-
assert(p0 >= 0 && p1 >= 0);
|
|
205
|
-
|
|
206
|
-
switch (swa_type) {
|
|
207
|
-
case LLAMA_SWA_TYPE_NONE:
|
|
208
|
-
{
|
|
209
|
-
} break;
|
|
210
|
-
case LLAMA_SWA_TYPE_STANDARD:
|
|
211
|
-
{
|
|
212
|
-
if (p1 - p0 >= (int32_t) n_swa) {
|
|
213
|
-
return true;
|
|
214
|
-
}
|
|
215
|
-
} break;
|
|
216
|
-
case LLAMA_SWA_TYPE_CHUNKED:
|
|
217
|
-
{
|
|
218
|
-
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
219
|
-
|
|
220
|
-
if (p0 < pos_chunk_start) {
|
|
221
|
-
return true;
|
|
222
|
-
}
|
|
223
|
-
} break;
|
|
224
|
-
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
225
|
-
{
|
|
226
|
-
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
227
|
-
const int32_t pos_diff = p1 - p0;
|
|
228
|
-
|
|
229
|
-
// Mask if outside the symmetric window
|
|
230
|
-
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
} break;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
return false;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
203
|
bool llama_hparams::use_mrope() const {
|
|
240
204
|
return rope_sections[0] > 0 && rope_sections[1] > 0;
|
|
241
205
|
}
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
#include "llama.h"
|
|
4
4
|
|
|
5
5
|
#include <array>
|
|
6
|
+
#include <cassert>
|
|
6
7
|
|
|
7
8
|
// bump if necessary
|
|
8
9
|
#define LLAMA_MAX_LAYERS 512
|
|
@@ -274,9 +275,45 @@ struct llama_hparams {
|
|
|
274
275
|
uint32_t n_layer_kv() const;
|
|
275
276
|
|
|
276
277
|
// note that this function uses different SWA parameters from those in the hparams
|
|
278
|
+
// note: inlined on purpose for performance reasons
|
|
277
279
|
// TODO: think of a better place for this function
|
|
278
280
|
// TODO: pack the SWA params in a struct?
|
|
279
|
-
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1)
|
|
281
|
+
static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
|
|
282
|
+
assert(p0 >= 0 && p1 >= 0);
|
|
283
|
+
|
|
284
|
+
switch (swa_type) {
|
|
285
|
+
case LLAMA_SWA_TYPE_NONE:
|
|
286
|
+
{
|
|
287
|
+
} break;
|
|
288
|
+
case LLAMA_SWA_TYPE_STANDARD:
|
|
289
|
+
{
|
|
290
|
+
if (p1 - p0 >= (int32_t) n_swa) {
|
|
291
|
+
return true;
|
|
292
|
+
}
|
|
293
|
+
} break;
|
|
294
|
+
case LLAMA_SWA_TYPE_CHUNKED:
|
|
295
|
+
{
|
|
296
|
+
const llama_pos pos_chunk_start = (p1 / n_swa) * n_swa;
|
|
297
|
+
|
|
298
|
+
if (p0 < pos_chunk_start) {
|
|
299
|
+
return true;
|
|
300
|
+
}
|
|
301
|
+
} break;
|
|
302
|
+
case LLAMA_SWA_TYPE_SYMMETRIC:
|
|
303
|
+
{
|
|
304
|
+
const int32_t half_n_swa = (int32_t) n_swa / 2;
|
|
305
|
+
const int32_t pos_diff = p1 - p0;
|
|
306
|
+
|
|
307
|
+
// Mask if outside the symmetric window
|
|
308
|
+
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
|
|
309
|
+
return true;
|
|
310
|
+
}
|
|
311
|
+
} break;
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
return false;
|
|
315
|
+
}
|
|
316
|
+
|
|
280
317
|
|
|
281
318
|
bool use_mrope() const;
|
|
282
319
|
};
|
|
@@ -852,7 +852,7 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
|
852
852
|
const llama_seq_id seq_id_cell = cells.seq_get(idx);
|
|
853
853
|
|
|
854
854
|
// SWA mask
|
|
855
|
-
if (is_masked_swa(pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
|
855
|
+
if (llama_hparams::is_masked_swa(n_swa, swa_type, pos_cell, cells.seq_pos_max(seq_id_cell) + 1)) {
|
|
856
856
|
can_use = true;
|
|
857
857
|
}
|
|
858
858
|
}
|
|
@@ -1237,90 +1237,236 @@ void llama_kv_cache::set_input_k_shift(ggml_tensor * dst) const {
|
|
|
1237
1237
|
}
|
|
1238
1238
|
}
|
|
1239
1239
|
|
|
1240
|
-
|
|
1241
|
-
const
|
|
1240
|
+
struct args_set_input_kq_mask {
|
|
1241
|
+
const llama_hparams & hparams;
|
|
1242
|
+
const llama_ubatch * ubatch;
|
|
1242
1243
|
|
|
1243
|
-
|
|
1244
|
-
|
|
1244
|
+
const std::vector<llama_kv_cells> & v_cells;
|
|
1245
|
+
const std::vector<uint32_t> & seq_to_stream;
|
|
1245
1246
|
|
|
1246
|
-
|
|
1247
|
-
|
|
1247
|
+
uint32_t n_swa;
|
|
1248
|
+
llama_swa_type swa_type;
|
|
1248
1249
|
|
|
1249
|
-
|
|
1250
|
+
int64_t n_kv;
|
|
1251
|
+
int64_t n_stream;
|
|
1252
|
+
int64_t n_tps;
|
|
1253
|
+
};
|
|
1250
1254
|
|
|
1251
|
-
|
|
1252
|
-
|
|
1255
|
+
template<bool causal, bool swa, bool is_2d, bool alibi>
|
|
1256
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1257
|
+
//const auto & hparams = args.hparams;
|
|
1258
|
+
const auto & ubatch = args.ubatch;
|
|
1253
1259
|
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
// Use only the previous KV cells of the correct sequence for each token of the ubatch.
|
|
1257
|
-
// It's assumed that if a token in the batch has multiple sequences, they are equivalent.
|
|
1258
|
-
// Example with a cache of 10 tokens, 2 tokens populated in cache and 3 tokens in batch:
|
|
1259
|
-
// Causal mask:
|
|
1260
|
-
// xxx-------
|
|
1261
|
-
// xxxx------
|
|
1262
|
-
// xxxxx-----
|
|
1263
|
-
// Non-causal mask:
|
|
1264
|
-
// xxxxx-----
|
|
1265
|
-
// xxxxx-----
|
|
1266
|
-
// xxxxx-----
|
|
1267
|
-
// To visualize the mask, see https://github.com/ggml-org/llama.cpp/pull/12615
|
|
1268
|
-
// TODO: optimize this section
|
|
1269
|
-
for (uint32_t h = 0; h < 1; ++h) {
|
|
1270
|
-
for (uint32_t s = 0; s < n_stream; ++s) {
|
|
1271
|
-
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
|
1272
|
-
const uint32_t i = s*n_tps + ii;
|
|
1260
|
+
const auto & v_cells = args.v_cells;
|
|
1261
|
+
const auto & seq_to_stream = args.seq_to_stream;
|
|
1273
1262
|
|
|
1274
|
-
|
|
1263
|
+
const uint32_t n_swa = args.n_swa;
|
|
1264
|
+
const llama_swa_type swa_type = args.swa_type;
|
|
1275
1265
|
|
|
1276
|
-
|
|
1266
|
+
const int64_t n_kv = args.n_kv;
|
|
1267
|
+
const int64_t n_stream = args.n_stream;
|
|
1268
|
+
const int64_t n_tps = args.n_tps;
|
|
1277
1269
|
|
|
1278
|
-
|
|
1270
|
+
// the min position in the batch for each sequence
|
|
1271
|
+
llama_pos seq_pos_min[LLAMA_MAX_SEQ];
|
|
1272
|
+
std::fill(seq_pos_min, seq_pos_min + LLAMA_MAX_SEQ, INT32_MAX);
|
|
1279
1273
|
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
|
1283
|
-
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
|
1274
|
+
for (uint32_t i = 0; i < ubatch->n_tokens; ++i) {
|
|
1275
|
+
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
|
1284
1276
|
|
|
1285
|
-
|
|
1277
|
+
seq_pos_min[seq_id] = std::min(seq_pos_min[seq_id], ubatch->pos[i]);
|
|
1278
|
+
}
|
|
1286
1279
|
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1280
|
+
for (uint32_t s = 0; s < n_stream; ++s) {
|
|
1281
|
+
// bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
|
|
1282
|
+
std::unordered_map<llama_seq_id, uint32_t> seq_srct;
|
|
1283
|
+
std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
|
|
1284
|
+
|
|
1285
|
+
for (uint32_t ii = 0; ii < n_tps; ++ii) {
|
|
1286
|
+
const uint32_t i = s*n_tps + ii;
|
|
1287
|
+
|
|
1288
|
+
const llama_seq_id seq_id = ubatch->seq_id[i][0];
|
|
1289
|
+
|
|
1290
|
+
const auto & cells = v_cells.at(seq_to_stream[seq_id]);
|
|
1291
|
+
|
|
1292
|
+
llama_pos p0 = -1;
|
|
1293
|
+
const llama_pos p1 = ubatch->pos[i];
|
|
1294
|
+
|
|
1295
|
+
// for M-RoPE
|
|
1296
|
+
const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
|
|
1297
|
+
const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens] : 0;
|
|
1298
|
+
|
|
1299
|
+
const uint64_t idst = n_kv*i;
|
|
1300
|
+
|
|
1301
|
+
// for tokens of the same sequence, the mask is mostly the same, so we can reuse it
|
|
1302
|
+
// the only cells that could change are the ones with positions similar to the
|
|
1303
|
+
// ones in the batch (i.e. due to causal masking, SWA, etc.)
|
|
1304
|
+
// keep track of those cells and shortcut the loop to save time
|
|
1305
|
+
// note: this optimization is not compatible with Alibi position encoding
|
|
1306
|
+
// ref: https://github.com/ggml-org/llama.cpp/pull/18842
|
|
1307
|
+
bool prev = false;
|
|
1291
1308
|
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1309
|
+
auto & idxs = seq_idxs[seq_id];
|
|
1310
|
+
|
|
1311
|
+
if (!alibi) {
|
|
1312
|
+
if (seq_srct.find(seq_id) != seq_srct.end()) {
|
|
1313
|
+
const uint32_t srct = seq_srct[seq_id];
|
|
1314
|
+
|
|
1315
|
+
const uint64_t idst_prev = n_kv*srct;
|
|
1316
|
+
|
|
1317
|
+
std::copy(data + idst_prev, data + idst_prev + n_kv, data + idst);
|
|
1318
|
+
|
|
1319
|
+
prev = true;
|
|
1320
|
+
} else {
|
|
1321
|
+
idxs.clear();
|
|
1322
|
+
idxs.reserve(ubatch->n_tokens + n_swa + 32);
|
|
1323
|
+
|
|
1324
|
+
seq_srct[seq_id] = i;
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
|
|
1328
|
+
for (uint32_t jj = 0; jj < n_kv; ++jj) {
|
|
1329
|
+
uint32_t j = jj;
|
|
1330
|
+
|
|
1331
|
+
// we have an existing mask for this sequence -> update just seq_idxs
|
|
1332
|
+
if (!alibi) {
|
|
1333
|
+
if (prev) {
|
|
1334
|
+
if (jj >= idxs.size()) {
|
|
1335
|
+
break;
|
|
1336
|
+
}
|
|
1337
|
+
|
|
1338
|
+
j = idxs[jj];
|
|
1295
1339
|
}
|
|
1340
|
+
}
|
|
1341
|
+
|
|
1342
|
+
if (cells.is_empty(j)) {
|
|
1343
|
+
goto skip;
|
|
1344
|
+
}
|
|
1345
|
+
|
|
1346
|
+
// mask the token if not the same sequence
|
|
1347
|
+
if (!cells.seq_has(j, seq_id)) {
|
|
1348
|
+
goto skip;
|
|
1349
|
+
}
|
|
1350
|
+
|
|
1351
|
+
p0 = cells.pos_get(j);
|
|
1296
1352
|
|
|
1297
|
-
|
|
1353
|
+
if (!alibi) {
|
|
1354
|
+
if (!prev) {
|
|
1355
|
+
// record all cells for which: p0 >= seq_pos_min[seq_id] - n_swa - 32
|
|
1356
|
+
if (p0 + (int32_t) (n_swa + 32) >= seq_pos_min[seq_id]) {
|
|
1357
|
+
idxs.push_back(j);
|
|
1358
|
+
}
|
|
1359
|
+
}
|
|
1360
|
+
}
|
|
1298
1361
|
|
|
1362
|
+
if (causal) {
|
|
1299
1363
|
// mask future tokens
|
|
1300
|
-
if (
|
|
1301
|
-
|
|
1364
|
+
if (p0 > p1) {
|
|
1365
|
+
goto skip;
|
|
1302
1366
|
}
|
|
1303
1367
|
|
|
1304
1368
|
// M-RoPE causal mask
|
|
1305
|
-
if (
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1369
|
+
if (is_2d) {
|
|
1370
|
+
if (p0 == p1) {
|
|
1371
|
+
const auto & p0_ext = cells.ext_get(j);
|
|
1372
|
+
|
|
1373
|
+
if (p0_ext.is_2d_gt(p1_x, p1_y)) {
|
|
1374
|
+
goto skip;
|
|
1375
|
+
}
|
|
1309
1376
|
}
|
|
1310
1377
|
}
|
|
1378
|
+
}
|
|
1311
1379
|
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1380
|
+
// apply SWA if any
|
|
1381
|
+
if (swa) {
|
|
1382
|
+
if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
|
|
1383
|
+
goto skip;
|
|
1315
1384
|
}
|
|
1385
|
+
}
|
|
1316
1386
|
|
|
1317
|
-
|
|
1387
|
+
if (alibi) {
|
|
1388
|
+
data[idst + j] = -std::abs(p0 - p1);
|
|
1389
|
+
} else {
|
|
1390
|
+
data[idst + j] = 0.0f;
|
|
1318
1391
|
}
|
|
1392
|
+
|
|
1393
|
+
continue;
|
|
1394
|
+
skip:
|
|
1395
|
+
data[idst + j] = -INFINITY;
|
|
1319
1396
|
}
|
|
1320
1397
|
}
|
|
1321
1398
|
}
|
|
1322
1399
|
}
|
|
1323
1400
|
|
|
1401
|
+
template<bool causal, bool swa, bool is_2d>
|
|
1402
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1403
|
+
const bool alibi = args.hparams.use_alibi;
|
|
1404
|
+
if (alibi) {
|
|
1405
|
+
set_input_kq_mask_impl<causal, swa, is_2d, true> (args, data);
|
|
1406
|
+
} else {
|
|
1407
|
+
set_input_kq_mask_impl<causal, swa, is_2d, false>(args, data);
|
|
1408
|
+
}
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
template<bool causal, bool swa>
|
|
1412
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1413
|
+
const bool is_2d = args.ubatch->is_pos_2d();
|
|
1414
|
+
if (is_2d) {
|
|
1415
|
+
set_input_kq_mask_impl<causal, swa, true> (args, data);
|
|
1416
|
+
} else {
|
|
1417
|
+
set_input_kq_mask_impl<causal, swa, false>(args, data);
|
|
1418
|
+
}
|
|
1419
|
+
}
|
|
1420
|
+
|
|
1421
|
+
template<bool causal>
|
|
1422
|
+
static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float * data) {
|
|
1423
|
+
const bool swa = args.swa_type != LLAMA_SWA_TYPE_NONE;
|
|
1424
|
+
if (swa) {
|
|
1425
|
+
set_input_kq_mask_impl<causal, true> (args, data);
|
|
1426
|
+
} else {
|
|
1427
|
+
set_input_kq_mask_impl<causal, false>(args, data);
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
|
|
1432
|
+
const uint32_t n_tokens = ubatch->n_tokens;
|
|
1433
|
+
|
|
1434
|
+
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
|
1435
|
+
float * data = (float *) dst->data;
|
|
1436
|
+
|
|
1437
|
+
const int64_t n_kv = dst->ne[0];
|
|
1438
|
+
const int64_t n_stream = dst->ne[3]; // num streams in the current ubatch
|
|
1439
|
+
|
|
1440
|
+
GGML_ASSERT(n_tokens%n_stream == 0);
|
|
1441
|
+
|
|
1442
|
+
// n_tps == n_tokens_per_stream
|
|
1443
|
+
const int64_t n_tps = n_tokens/n_stream;
|
|
1444
|
+
|
|
1445
|
+
//const int64_t t_start = ggml_time_us();
|
|
1446
|
+
|
|
1447
|
+
const args_set_input_kq_mask args = {
|
|
1448
|
+
/*.hparams =*/ hparams,
|
|
1449
|
+
/*.ubatch =*/ ubatch,
|
|
1450
|
+
/*.v_cells =*/ v_cells,
|
|
1451
|
+
/*.seq_to_stream =*/ seq_to_stream,
|
|
1452
|
+
/*.n_swa =*/ n_swa,
|
|
1453
|
+
/*.swa_type =*/ swa_type,
|
|
1454
|
+
/*.n_kv =*/ n_kv,
|
|
1455
|
+
/*.n_stream =*/ n_stream,
|
|
1456
|
+
/*.n_tps =*/ n_tps,
|
|
1457
|
+
};
|
|
1458
|
+
|
|
1459
|
+
if (causal_attn) {
|
|
1460
|
+
set_input_kq_mask_impl<true> (args, data);
|
|
1461
|
+
} else {
|
|
1462
|
+
set_input_kq_mask_impl<false>(args, data);
|
|
1463
|
+
}
|
|
1464
|
+
|
|
1465
|
+
//const int64_t t_end = ggml_time_us();
|
|
1466
|
+
|
|
1467
|
+
//LLAMA_LOG_ERROR("%s: kq mask time: %0.3f ms\n", __func__, (t_end - t_start)/1000.0);
|
|
1468
|
+
}
|
|
1469
|
+
|
|
1324
1470
|
void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
|
1325
1471
|
const int64_t n_tokens = ubatch->n_tokens;
|
|
1326
1472
|
|
|
@@ -1483,10 +1629,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
|
|
1483
1629
|
return gf;
|
|
1484
1630
|
}
|
|
1485
1631
|
|
|
1486
|
-
bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
|
|
1487
|
-
return llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1);
|
|
1488
|
-
}
|
|
1489
|
-
|
|
1490
1632
|
void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
|
|
1491
1633
|
GGML_UNUSED(flags);
|
|
1492
1634
|
|
|
@@ -244,11 +244,14 @@ struct llama_file::impl {
|
|
|
244
244
|
}
|
|
245
245
|
errno = 0;
|
|
246
246
|
if (fd == -1) {
|
|
247
|
-
|
|
247
|
+
const size_t curr_off = tell();
|
|
248
|
+
const size_t to_read = std::min(len, size - curr_off);
|
|
249
|
+
|
|
250
|
+
std::size_t ret = std::fread(ptr, to_read, 1, fp);
|
|
248
251
|
if (ferror(fp)) {
|
|
249
252
|
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
|
250
253
|
}
|
|
251
|
-
if (ret != 1) {
|
|
254
|
+
if (to_read > 0 && ret != 1) {
|
|
252
255
|
throw std::runtime_error("unexpectedly reached end of file");
|
|
253
256
|
}
|
|
254
257
|
} else {
|
|
@@ -262,7 +265,8 @@ struct llama_file::impl {
|
|
|
262
265
|
continue; // Interrupted by signal, retry
|
|
263
266
|
}
|
|
264
267
|
// Fallback to std::fread in case the DMA controller cannot access the buffer
|
|
265
|
-
if (errno == EFAULT) {
|
|
268
|
+
if (errno == EFAULT || errno == EINVAL) {
|
|
269
|
+
LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
|
|
266
270
|
auto curr_off = tell();
|
|
267
271
|
close(fd);
|
|
268
272
|
fd = -1;
|
|
@@ -381,6 +385,9 @@ int llama_file::file_id() const {
|
|
|
381
385
|
#ifdef _WIN32
|
|
382
386
|
return _fileno(pimpl->fp);
|
|
383
387
|
#else
|
|
388
|
+
if (pimpl->fd != -1) {
|
|
389
|
+
return pimpl->fd;
|
|
390
|
+
}
|
|
384
391
|
#if defined(fileno)
|
|
385
392
|
return fileno(pimpl->fp);
|
|
386
393
|
#else
|
|
@@ -611,9 +618,9 @@ struct llama_mlock::impl {
|
|
|
611
618
|
|
|
612
619
|
char* errmsg = std::strerror(errno);
|
|
613
620
|
bool suggest = (errno == ENOMEM);
|
|
614
|
-
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX)
|
|
615
|
-
// visionOS/tvOS
|
|
616
|
-
// Skip resource limit checks on
|
|
621
|
+
#if defined(TARGET_OS_VISION) || defined(TARGET_OS_TV) || defined(_AIX) || defined(__HAIKU__)
|
|
622
|
+
// visionOS/tvOS/Haiku don't support RLIMIT_MEMLOCK
|
|
623
|
+
// Skip resource limit checks on these platforms
|
|
617
624
|
suggest = false;
|
|
618
625
|
#else
|
|
619
626
|
struct rlimit lock_limit;
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
#include "ggml.h"
|
|
4
4
|
|
|
5
|
+
#include <algorithm>
|
|
5
6
|
#include <array>
|
|
6
7
|
#include <cinttypes>
|
|
7
8
|
#include <cstring>
|
|
@@ -344,6 +345,7 @@ namespace GGUFMeta {
|
|
|
344
345
|
GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
|
|
345
346
|
|
|
346
347
|
switch (arr_info.gt) {
|
|
348
|
+
case GGUF_TYPE_BOOL:
|
|
347
349
|
case GGUF_TYPE_UINT32:
|
|
348
350
|
case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
|
|
349
351
|
(std::is_same<T, uint32_t>::value)); break;
|
|
@@ -365,7 +367,13 @@ namespace GGUFMeta {
|
|
|
365
367
|
result[i] = value;
|
|
366
368
|
}
|
|
367
369
|
} else {
|
|
368
|
-
|
|
370
|
+
if (arr_info.gt == GGUF_TYPE_BOOL) {
|
|
371
|
+
std::transform((const bool *)arr_info.data, (const bool *)arr_info.data + arr_info.length, result.begin(), [](bool x) {
|
|
372
|
+
return static_cast<T>(x);
|
|
373
|
+
});
|
|
374
|
+
} else {
|
|
375
|
+
std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
|
|
376
|
+
}
|
|
369
377
|
}
|
|
370
378
|
|
|
371
379
|
return true;
|
|
@@ -531,12 +539,18 @@ llama_model_loader::llama_model_loader(
|
|
|
531
539
|
files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
|
|
532
540
|
contexts.emplace_back(ctx);
|
|
533
541
|
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
542
|
+
if (use_mmap && use_direct_io) {
|
|
543
|
+
if (files.back()->has_direct_io()) {
|
|
544
|
+
// Disable mmap, as DirectIO is available
|
|
545
|
+
use_mmap = false;
|
|
546
|
+
LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
|
|
547
|
+
} else {
|
|
548
|
+
// Disable DirectIO and reopen file using std::fopen for mmap
|
|
549
|
+
use_direct_io = false;
|
|
550
|
+
files.pop_back();
|
|
551
|
+
files.emplace_back(new llama_file(fname.c_str(), "rb", false));
|
|
552
|
+
LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
|
|
553
|
+
}
|
|
540
554
|
}
|
|
541
555
|
|
|
542
556
|
// Save tensors data offset of the main file.
|