@fugood/llama.node 0.4.7 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +4 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/lib/binding.ts +66 -6
- package/lib/index.js +59 -17
- package/lib/index.ts +74 -23
- package/package.json +1 -1
- package/src/DecodeAudioTokenWorker.cpp +40 -0
- package/src/DecodeAudioTokenWorker.h +22 -0
- package/src/EmbeddingWorker.cpp +7 -5
- package/src/LlamaCompletionWorker.cpp +68 -54
- package/src/LlamaCompletionWorker.h +7 -8
- package/src/LlamaContext.cpp +551 -235
- package/src/LlamaContext.h +26 -4
- package/src/LoadSessionWorker.cpp +4 -2
- package/src/SaveSessionWorker.cpp +10 -6
- package/src/TokenizeWorker.cpp +23 -14
- package/src/TokenizeWorker.h +2 -2
- package/src/addons.cc +8 -11
- package/src/common.hpp +129 -126
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
- package/src/tts_utils.cpp +342 -0
- package/src/tts_utils.h +62 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
@@ -1102,6 +1102,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }
 
+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }
 
@@ -1133,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;
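
For reference, `swa_full` is a new `common_params` flag (declared in the common.h hunk further below) that requests a full-size KV cache for sliding-window-attention models, and `common_context_params_to_llama()` now forwards it as shown above. A minimal sketch of opting in from application code; only the field and function names come from this diff, the helper itself is hypothetical:

```cpp
#include "common.h"

// Hypothetical helper (not part of the package): build llama_context_params
// with the full-size SWA KV cache enabled. common_context_params_to_llama()
// copies params.swa_full into cparams.swa_full, as shown in the hunk above.
static llama_context_params make_ctx_params(const common_params & base) {
    common_params params = base;
    params.swa_full = true; // default stays false, per the common.h hunk below
    return common_context_params_to_llama(params);
}
```
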
@@ -1325,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
@@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -323,13 +323,13 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
@@ -428,6 +428,11 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon
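
The new `load_progress_callback` fields flow from `common_params` into `llama_model_params` via `common_model_params_to_llama()` (first hunk above). A minimal sketch of how a libcommon consumer might hook it; only the field and function names are taken from this diff, and the callback signature follows `llama_progress_callback` in llama.h (progress in [0.0, 1.0], return `false` to cancel). Everything else is illustrative:

```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>

// Hypothetical progress handler: prints a percentage and never cancels.
// Signature matches llama_progress_callback from llama.h.
static bool on_load_progress(float progress, void * /*user_data*/) {
    std::fprintf(stderr, "\rloading model: %3.0f%%", progress * 100.0f);
    return true; // returning false would abort the model load
}

// Illustrative helper: set the two new fields before converting to
// llama_model_params; common_model_params_to_llama() now forwards them to
// mparams.progress_callback / mparams.progress_callback_user_data.
static llama_model_params make_model_params(common_params & params) {
    params.load_progress_callback           = on_load_progress;
    params.load_progress_callback_user_data = nullptr;
    return common_model_params_to_llama(params);
}

int main() {
    common_params params;                        // defaults; model path etc. omitted here
    llama_model_params mparams = make_model_params(params);
    (void) mparams;                              // pass to llama_model_load_from_file() in real code
    return 0;
}
```

In `common_model_params_to_llama()` the two fields are copied verbatim into `mparams`, so the hook behaves exactly like `llama_model_params::progress_callback`.
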
@@ -616,16 +621,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +150,6 @@ int main(int argc, char ** argv) {
     // here we keep adding new n-grams as we go
     ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
     const auto t_dec_start = ggml_time_us();
 
     // sample first token
@@ -172,12 +167,6 @@ int main(int argc, char ** argv) {
     }
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
         //
         // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@ int main(int argc, char ** argv) {
 
     common_sampler_free(smpl);
 
-    llama_kv_cache_view_free(&kvc_view);
-
     llama_batch_free(batch);
 
     llama_backend_free();
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.speculative.n_max;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -110,18 +108,9 @@ int main(int argc, char ** argv){
 
     llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // print current draft sequence
         LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // is the system prompt shared in the cache
     const bool is_sp_shared = params.is_pp_shared;
 
@@ -241,8 +239,6 @@ int main(int argc, char ** argv) {
     int32_t n_total_gen = 0;
     int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@ int main(int argc, char ** argv) {
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         common_batch_clear(batch);
 
         // decode any currently ongoing sequences
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void
+static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (
-        LOG_ERR("%s : failed to
+    if (llama_encode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to encode\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-
+            batch_encode(ctx, batch, out, s, n_embd);
             common_batch_clear(batch);
             p += s;
             s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 
     // final batch
     float * out = emb + p * n_embd;
-
+    batch_encode(ctx, batch, out, s, n_embd);
 
     // save embeddings to chunks
     for (int i = 0; i < n_chunks; i++) {
|
|
|
267
267
|
batch_add_seq(query_batch, query_tokens, 0);
|
|
268
268
|
|
|
269
269
|
std::vector<float> query_emb(n_embd, 0);
|
|
270
|
-
|
|
270
|
+
batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
|
|
271
271
|
|
|
272
272
|
common_batch_clear(query_batch);
|
|
273
273
|
|
|
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers = ngl;
 
     llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     // tokenize the prompt
 
     // find the number of tokens in the prompt
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first =
+        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
         while (true) {
             // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used =
+            int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
             if (n_ctx_used + batch.n_tokens > n_ctx) {
                 printf("\033[0m\n");
                 fprintf(stderr, "context size exceeded\n");
@@ -12,16 +12,16 @@ source /opt/intel/oneapi/setvars.sh
 
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
-NGL=
-
+NGL=99
+CONTEXT=4096
 
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
 
 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+# If you want more control, DPC++ Allows selecting a specific device through the
+# following environment variable
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
+NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
+CONTEXT=4096
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "Using $GGML_SYCL_DEVICE as the main GPU"
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
+fi
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
 
-.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0
@@ -0,0 +1,9 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99
@@ -128,6 +128,8 @@ extern "C" {
     // set gradients to zero, initilize loss, and optionally reset the optimizer
     GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
 
+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
+
     // get underlying tensors that store data
     // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
     GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
@@ -536,6 +536,7 @@ extern "C" {
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_GELU_ERF,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1024,6 +1025,16 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * a);
 
+    // GELU using erf (error function) when possible
+    // some backends may fallback to approximation based on Abramowitz and Stegun formula
+    GGML_API struct ggml_tensor * ggml_gelu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a);
+
     GGML_API struct ggml_tensor * ggml_gelu_quick(
             struct ggml_context * ctx,
             struct ggml_tensor * a);