@fugood/llama.node 0.4.7 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/bin/darwin/arm64/llama-node.node +0 -0
  2. package/bin/darwin/x64/llama-node.node +0 -0
  3. package/bin/linux/arm64/llama-node.node +0 -0
  4. package/bin/linux/x64/llama-node.node +0 -0
  5. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  6. package/bin/linux-cuda/x64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  9. package/bin/win32/arm64/llama-node.node +0 -0
  10. package/bin/win32/arm64/node.lib +0 -0
  11. package/bin/win32/x64/llama-node.node +0 -0
  12. package/bin/win32/x64/node.lib +0 -0
  13. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  14. package/bin/win32-vulkan/arm64/node.lib +0 -0
  15. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  16. package/bin/win32-vulkan/x64/node.lib +0 -0
  17. package/lib/binding.ts +20 -6
  18. package/lib/index.js +41 -17
  19. package/lib/index.ts +50 -23
  20. package/package.json +1 -1
  21. package/src/LlamaCompletionWorker.cpp +9 -9
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +37 -18
  24. package/src/LlamaContext.h +1 -0
  25. package/src/TokenizeWorker.cpp +16 -12
  26. package/src/TokenizeWorker.h +2 -2
  27. package/src/common.hpp +54 -50
  28. package/src/llama.cpp/.github/workflows/build.yml +2 -2
  29. package/src/llama.cpp/.github/workflows/release.yml +152 -129
  30. package/src/llama.cpp/.github/workflows/winget.yml +42 -0
  31. package/src/llama.cpp/common/arg.cpp +14 -13
  32. package/src/llama.cpp/common/common.cpp +4 -75
  33. package/src/llama.cpp/common/common.h +7 -12
  34. package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
  35. package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
  36. package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
  37. package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
  38. package/src/llama.cpp/examples/simple/simple.cpp +1 -1
  39. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
  40. package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
  41. package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
  42. package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
  43. package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
  44. package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
  45. package/src/llama.cpp/ggml/include/ggml.h +11 -0
  46. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
  47. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
  48. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
  49. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
  50. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
  51. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
  52. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
  53. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
  54. package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
  55. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
  56. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
  57. package/src/llama.cpp/ggml/src/ggml.c +64 -18
  58. package/src/llama.cpp/include/llama.h +24 -124
  59. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
  60. package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
  61. package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
  62. package/src/llama.cpp/src/llama-batch.cpp +3 -1
  63. package/src/llama.cpp/src/llama-context.cpp +60 -110
  64. package/src/llama.cpp/src/llama-graph.cpp +137 -233
  65. package/src/llama.cpp/src/llama-graph.h +49 -7
  66. package/src/llama.cpp/src/llama-hparams.cpp +17 -1
  67. package/src/llama.cpp/src/llama-hparams.h +34 -5
  68. package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
  69. package/src/llama.cpp/src/llama-kv-cache.h +201 -85
  70. package/src/llama.cpp/src/llama-memory.h +3 -2
  71. package/src/llama.cpp/src/llama-model.cpp +273 -94
  72. package/src/llama.cpp/src/llama-model.h +4 -1
  73. package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
  74. package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
  75. package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
  76. package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
  77. package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
  78. package/src/llama.cpp/tools/mtmd/clip.h +6 -4
  79. package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
  80. package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
  81. package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
  82. package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
  83. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
  84. package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
  85. package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
  86. package/src/llama.cpp/tools/run/run.cpp +2 -2
  87. package/src/llama.cpp/tools/server/server.cpp +158 -47
  88. package/src/llama.cpp/tools/server/utils.hpp +71 -43
  89. package/src/llama.cpp/tools/tts/tts.cpp +4 -2

package/src/llama.cpp/common/common.cpp
@@ -1102,6 +1102,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
     }
 
+    mparams.progress_callback = params.load_progress_callback;
+    mparams.progress_callback_user_data = params.load_progress_callback_user_data;
+
     return mparams;
 }
 
@@ -1133,6 +1136,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
     cparams.op_offload = !params.no_op_offload;
+    cparams.swa_full = params.swa_full;
 
     if (params.reranking) {
         cparams.embeddings = true;
@@ -1325,81 +1329,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
     return text;
 }
 
-//
-// KV cache utils
-//
-
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                const size_t sz = seqs.size();
-                seqs[cs_curr[j]] = sz;
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_seq_max; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
 //
 // Embedding utils
 //
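
The two lines added to common_model_params_to_llama() simply forward the new common_params fields into llama_model_params. For orientation, a minimal sketch of how that hook behaves at the llama.h level (the model path and the fprintf reporting are illustrative, not taken from the diff):

    #include "llama.h"
    #include <cstdio>

    // Loader progress callback: called with values in [0.0, 1.0].
    // Returning true continues loading; returning false aborts it.
    static bool on_load_progress(float progress, void * user_data) {
        (void) user_data;
        fprintf(stderr, "\rloading model: %3d%%", (int) (progress * 100.0f));
        return true;
    }

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.progress_callback           = on_load_progress;
        mparams.progress_callback_user_data = NULL;

        // "model.gguf" is a placeholder path for this sketch
        llama_model * model = llama_model_load_from_file("model.gguf", mparams);
        if (model != NULL) {
            llama_model_free(model);
        }

        llama_backend_free();
        return 0;
    }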

package/src/llama.cpp/common/common.h
@@ -76,7 +76,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_LOOKUP,
     LLAMA_EXAMPLE_PARALLEL,
     LLAMA_EXAMPLE_TTS,
@@ -323,13 +323,13 @@ struct common_params {
     bool flash_attn = false; // flash attention
     bool no_perf = false; // disable performance metrics
     bool ctx_shift = true; // context shift on inifinite text generation
+    bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
@@ -428,6 +428,11 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+    // optional callback for model loading progress and cancellation:
+    // called with a progress value between 0.0 and 1.0.
+    // return false from callback to abort model loading or true to continue
+    llama_progress_callback load_progress_callback = NULL;
+    void * load_progress_callback_user_data = NULL;
 };
 
 // call once at the start of a program if it uses libcommon
@@ -616,16 +621,6 @@ std::string common_detokenize(
         const std::vector<llama_token> & tokens,
         bool special = true);
 
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
 //
 // Embedding utils
 //
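
Together with the common.cpp hunk above, these fields let any program built on libcommon observe or cancel model loading and opt in to the full-size SWA cache before the helpers convert common_params into llama params. A rough sketch (the callback body and helper name are illustrative):

    #include "common.h"
    #include <cstdio>

    // Sketch: set the new fields before calling common_init_from_params(),
    // which converts them via common_model_params_to_llama() /
    // common_context_params_to_llama() as shown in the hunks above.
    static void configure_loading(common_params & params) {
        params.swa_full = true; // use the full-size SWA KV cache

        params.load_progress_callback = [](float progress, void * /*user_data*/) -> bool {
            fprintf(stderr, "\rload: %3d%%", (int) (progress * 100.0f));
            return true; // returning false aborts the load
        };
        params.load_progress_callback_user_data = NULL;
    }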

package/src/llama.cpp/examples/lookahead/lookahead.cpp
@@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
     const int N = 5; // n-gram size
     const int G = 15; // max verification n-grams
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +150,6 @@
     // here we keep adding new n-grams as we go
     ngram_container ngrams_observed(llama_vocab_n_tokens(vocab), N, G);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
     const auto t_dec_start = ggml_time_us();
 
     // sample first token
@@ -172,12 +167,6 @@
     }
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
         //
         // Example for W = 5, N = 4, G = 2:
@@ -473,8 +462,6 @@
 
     common_sampler_free(smpl);
 
-    llama_kv_cache_view_free(&kvc_view);
-
     llama_batch_free(batch);
 
     llama_backend_free();

package/src/llama.cpp/examples/lookup/lookup.cpp
@@ -24,8 +24,6 @@ int main(int argc, char ** argv){
     // max. number of additional tokens to draft if match is found
     const int n_draft = params.speculative.n_max;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -110,18 +108,9 @@
 
     llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
 
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
     const auto t_dec_start = ggml_time_us();
 
     while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         // print current draft sequence
         LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str());
 

package/src/llama.cpp/examples/parallel/parallel.cpp
@@ -178,8 +178,6 @@ int main(int argc, char ** argv) {
     // insert new requests as soon as the previous one is done
     const bool cont_batching = params.cont_batching;
 
-    const bool dump_kv_cache = params.dump_kv_cache;
-
     // is the system prompt shared in the cache
     const bool is_sp_shared = params.is_pp_shared;
 
@@ -241,8 +239,6 @@
     int32_t n_total_gen = 0;
     int32_t n_cache_miss = 0;
 
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
-
     const auto t_main_start = ggml_time_us();
 
     LOG_INF("%s: Simulating parallel requests from clients:\n", __func__);
@@ -272,11 +268,6 @@
     LOG_INF("Processing requests ...\n\n");
 
     while (true) {
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            common_kv_cache_dump_view_seqs(kvc_view, 40);
-        }
-
         common_batch_clear(batch);
 
         // decode any currently ongoing sequences

package/src/llama.cpp/examples/retrieval/retrieval.cpp
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_decode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to decode\n", __func__);
+    if (llama_encode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to encode\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
            float * out = emb + p * n_embd;
-           batch_decode(ctx, batch, out, s, n_embd);
+           batch_encode(ctx, batch, out, s, n_embd);
            common_batch_clear(batch);
            p += s;
            s = 0;
@@ -246,7 +246,7 @@
 
     // final batch
     float * out = emb + p * n_embd;
-    batch_decode(ctx, batch, out, s, n_embd);
+    batch_encode(ctx, batch, out, s, n_embd);
 
     // save embeddings to chunks
     for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
-        batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
+        batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
 
         common_batch_clear(query_batch);
 
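
The rename from batch_decode to batch_encode tracks the switch to llama_encode() for embedding models. A self-contained sketch of the same pattern outside the example (the pooling and error-handling assumptions are ours, not from the diff):

    #include "common.h"
    #include "llama.h"
    #include <vector>

    // Sketch: embed one tokenized sequence with an embedding model.
    // Assumes the context was created with embeddings enabled and a pooling type set.
    static std::vector<float> embed_sequence(llama_context * ctx, const std::vector<llama_token> & tokens) {
        llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);
        for (size_t i = 0; i < tokens.size(); i++) {
            common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, true);
        }

        std::vector<float> out;
        // embedding models go through llama_encode(), not llama_decode()
        if (llama_encode(ctx, batch) == 0) {
            const int n_embd  = llama_model_n_embd(llama_get_model(ctx));
            const float * emb = llama_get_embeddings_seq(ctx, 0); // pooled embedding for sequence 0
            if (emb != NULL) {
                out.assign(emb, emb + n_embd);
            }
        }

        llama_batch_free(batch);
        return out;
    }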

package/src/llama.cpp/examples/simple/simple.cpp
@@ -84,13 +84,13 @@ int main(int argc, char ** argv) {
     model_params.n_gpu_layers = ngl;
 
     llama_model * model = llama_model_load_from_file(model_path.c_str(), model_params);
-    const llama_vocab * vocab = llama_model_get_vocab(model);
 
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
     // tokenize the prompt
 
     // find the number of tokens in the prompt

package/src/llama.cpp/examples/simple-chat/simple-chat.cpp
@@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
     auto generate = [&](const std::string & prompt) {
         std::string response;
 
-        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0;
 
         // tokenize the prompt
         const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-           int n_ctx_used = llama_kv_self_used_cells(ctx);
+           int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");

package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -12,16 +12,16 @@ source /opt/intel/oneapi/setvars.sh
 
 INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
 MODEL_FILE=models/llama-2-7b.Q4_0.gguf
-NGL=33
-CONEXT=4096
+NGL=99
+CONTEXT=4096
 
 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
     echo "use $GGML_SYCL_DEVICE as main GPU"
     #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT} -mg $GGML_SYCL_DEVICE -sm none
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
 
 else
     #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONEXT}
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
 fi

package/src/llama.cpp/examples/sycl/run-llama3.sh (new file)
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# MIT license
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+# If you want more control, DPC++ Allows selecting a specific device through the
+# following environment variable
+#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+source /opt/intel/oneapi/setvars.sh
+
+#export GGML_SYCL_DEBUG=1
+
+#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
+
+INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
+MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
+NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
+CONTEXT=4096
+
+if [ $# -gt 0 ]; then
+    GGML_SYCL_DEVICE=$1
+    echo "Using $GGML_SYCL_DEVICE as the main GPU"
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
+fi

package/src/llama.cpp/examples/sycl/win-run-llama2.bat
@@ -6,4 +6,4 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
 
 
-.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0
+.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0

package/src/llama.cpp/examples/sycl/win-run-llama3.bat (new file)
@@ -0,0 +1,9 @@
+:: MIT license
+:: Copyright (C) 2024 Intel Corporation
+:: SPDX-License-Identifier: MIT
+
+set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
+
+
+.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99

package/src/llama.cpp/ggml/include/ggml-opt.h
@@ -128,6 +128,8 @@ extern "C" {
     // set gradients to zero, initilize loss, and optionally reset the optimizer
     GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
 
+    GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
+
     // get underlying tensors that store data
     // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
     GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor

package/src/llama.cpp/ggml/include/ggml.h
@@ -536,6 +536,7 @@ extern "C" {
         GGML_UNARY_OP_HARDSWISH,
         GGML_UNARY_OP_HARDSIGMOID,
         GGML_UNARY_OP_EXP,
+        GGML_UNARY_OP_GELU_ERF,
 
         GGML_UNARY_OP_COUNT,
     };
@@ -1024,6 +1025,16 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    // GELU using erf (error function) when possible
+    // some backends may fallback to approximation based on Abramowitz and Stegun formula
+    GGML_API struct ggml_tensor * ggml_gelu_erf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_erf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_gelu_quick(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
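
The new operator is used like the other ggml unary ops; a minimal graph-construction sketch (the tensor size and the compute step are illustrative and depend on the backend the graph is scheduled on):

    #include "ggml.h"

    // Build a tiny graph that applies the erf-based GELU to an n-element vector.
    static struct ggml_cgraph * build_gelu_erf_graph(struct ggml_context * ctx, int64_t n) {
        struct ggml_tensor * x   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        struct ggml_tensor * out = ggml_gelu_erf(ctx, x); // or ggml_gelu_erf_inplace(ctx, x)

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        // computing gf is backend-specific, e.g. ggml_graph_compute_with_ctx(ctx, gf, n_threads)
        // from the CPU backend
        return gf;
    }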