llama_cpp 0.16.1 → 0.16.2

@@ -286,6 +286,7 @@ enum llm_kv {
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
  LLM_KV_FEED_FORWARD_LENGTH,
  LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
  LLM_KV_USE_PARALLEL_RESIDUAL,
  LLM_KV_TENSOR_DATA_LAYOUT,
  LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
- { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
- { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
- { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
- { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+ { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
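For orientation, the "%s" in these name strings is replaced with the architecture prefix when a key is looked up, so the new entry maps to a per-architecture GGUF key. A minimal illustration, assuming the architecture string is "qwen2moe" (an illustrative value, not taken from this diff):

// illustrative expansion only; the prefix comes from the model's architecture name at runtime
// "%s.expert_shared_feed_forward_length" -> "qwen2moe.expert_shared_feed_forward_length"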
@@ -1278,6 +1280,126 @@ struct no_init {
  };

  struct llama_file {
+
+ #if defined(_WIN32)
+ // use FILE * so we don't have to re-open the file to mmap
+ FILE * fp;
+ HANDLE fp_win32;
+ size_t size;
+
+ private:
+ std::string GetErrorMessageWin32(DWORD error_code) const {
+ std::string ret;
+ LPSTR lpMsgBuf = NULL;
+ DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+ NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+ if (!bufLen) {
+ ret = format("Win32 error code: %s", error_code);
+ } else {
+ ret = lpMsgBuf;
+ LocalFree(lpMsgBuf);
+ }
+
+ return ret;
+ }
+
+ public:
+
+ llama_file(const char * fname, const char * mode) {
+ fp = ggml_fopen(fname, mode);
+ if (fp == NULL) {
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+ }
+ fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+ seek(0, SEEK_END);
+ size = tell();
+ seek(0, SEEK_SET);
+ }
+
+ size_t tell() const {
+ // SetFilePointerEx returns the current position when seeking relative 0 bytes
+ LARGE_INTEGER li;
+ li.QuadPart = 0;
+ BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+
+ return li.QuadPart;
+ }
+
+ void seek(size_t offset, int whence) const {
+ // no need to convert SEEK_* to FILE_*. The enums are the same.
+ // Still, keep static asserts to avoid failures in the future.
+ static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+ static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+ static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+ LARGE_INTEGER li;
+ li.QuadPart = offset;
+ BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+ if (!ret) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ }
+
+ void read_raw(void * ptr, size_t len) const {
+ // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+ // use the Win32 API to do file io instead of the C/C++ library functions.
+
+ // There are conditions under which ReadFile cannot read chunks >64MB.
+ // Thus split the operation into smaller chunks if len exceeds this limit.
+ size_t bytes_read = 0;
+ while (bytes_read < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+ DWORD chunk_read = 0;
+ BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+ if (!result) {
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_read < chunk_size || chunk_read == 0) {
+ throw std::runtime_error("unexpectedly reached end of file");
+ }
+
+ bytes_read += chunk_read;
+ } ;
+ }
+
+ uint32_t read_u32() const {
+ uint32_t val;
+ read_raw(&val, sizeof(val));
+ return val;
+ }
+
+ void write_raw(const void * ptr, size_t len) const {
+ // There are conditions under which WriteFile cannot write chunks >64MB.
+ // Thus split the operation into smaller chunks if len exceeds this limit.
+ size_t bytes_written = 0;
+ while (bytes_written < len) {
+ size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+ DWORD chunk_written = 0;
+ BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+ if (!result) {
+ throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+ }
+ if (chunk_written < chunk_size || chunk_written == 0) {
+ throw std::runtime_error("unexpectedly failed to write bytes");
+ }
+
+ bytes_written += chunk_written;
+ }
+ }
+
+ void write_u32(std::uint32_t val) const {
+ write_raw(&val, sizeof(val));
+ }
+
+ ~llama_file() {
+ if (fp) {
+ std::fclose(fp);
+ }
+ }
+ #else
  // use FILE * so we don't have to re-open the file to mmap
  FILE * fp;
  size_t size;
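The gist of the new Windows branch above: reads and writes go through ReadFile/WriteFile in chunks of at most 64 MB, with errors turned into std::runtime_error via FormatMessageA. A minimal caller-side sketch of the class as added by this diff (the file name and the local buffer are illustrative, not from the diff):

// hypothetical usage; llama_file throws std::runtime_error on any I/O failure
llama_file f("model.gguf", "rb");
const uint32_t magic = f.read_u32();   // first 4 bytes of the file
std::vector<uint8_t> buf(f.size);
f.seek(0, SEEK_SET);
f.read_raw(buf.data(), buf.size());    // internally split into <=64 MB ReadFile calls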
@@ -1298,7 +1420,10 @@ struct llama_file {
  #else
  long ret = std::ftell(fp);
  #endif
- GGML_ASSERT(ret != -1); // this really shouldn't fail
+ if (ret == -1) {
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+ }
+
  return (size_t) ret;
  }

@@ -1308,7 +1433,9 @@ struct llama_file {
  #else
  int ret = std::fseek(fp, (long) offset, whence);
  #endif
- GGML_ASSERT(ret == 0); // same
+ if (ret != 0) {
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
+ }
  }

  void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1478,7 @@ struct llama_file {
  std::fclose(fp);
  }
  }
+ #endif
  };
  using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1844,6 +1972,7 @@ struct llama_hparams {
  uint32_t n_lora_q = 0;
  uint32_t n_lora_kv = 0;
  uint32_t n_ff_exp = 0;
+ uint32_t n_ff_shexp = 0;
  uint32_t n_expert_shared = 0;
  float expert_weights_scale = 0.0;

@@ -1892,6 +2021,7 @@ struct llama_hparams {
  if (this->n_lora_q != other.n_lora_q) return true;
  if (this->n_lora_kv != other.n_lora_kv) return true;
  if (this->n_ff_exp != other.n_ff_exp) return true;
+ if (this->n_ff_shexp != other.n_ff_shexp) return true;
  if (this->n_expert_shared != other.n_expert_shared) return true;

  if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2163,6 +2293,8 @@ struct llama_vocab {
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
  enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+ int max_token_len = 0; // used for optimizing longest token search
+
  std::unordered_map<token, id> token_to_id;
  std::vector<token_data> id_to_token;

@@ -2180,16 +2312,17 @@ struct llama_vocab {
  id special_cls_id = -1;
  id special_mask_id = -1;

- int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
- int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
  id linefeed_id = 13;
  id special_prefix_id = -1;
  id special_suffix_id = -1;
  id special_middle_id = -1;
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

- bool add_space_prefix = true;
+ // tokenizer flags
+ bool tokenizer_add_space_prefix = true;
+ bool tokenizer_add_bos = false;
+ bool tokenizer_add_eos = false;
+ bool tokenizer_ignore_merges = false;

  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -3721,6 +3854,44 @@ struct llama_model_loader {
  std::vector<no_init<uint8_t>> read_buf;
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+ #if defined(GGML_USE_CUDA)
+ // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+ // NVMe raid configurations might require more / larger buffers.
+ constexpr size_t num_buffers = 4;
+ constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+ std::vector<ggml_backend_buffer_t> host_buffers;
+ std::vector<void*> host_ptrs;
+ std::vector<ggml_backend_event_t> events;
+ size_t buffer_idx = 0; // buffer to use for async loads
+
+ ggml_backend_t cuda_backend = nullptr;
+ if (!use_mmap && !check_tensors) {
+ // When not using mmaped io use async uploads from pinned memory to GPU memory.
+ // First determine if the CUDA backend is active, and if so, determine the device ID.
+ ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+ if (buf) {
+ ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+ auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+ if (buffer_type == cuda_buffer_type) {
+ cuda_backend = ggml_backend_cuda_init(i);
+ break;
+ }
+ }
+ }
+
+ // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+ if (cuda_backend) {
+ for (size_t idx = 0; idx < num_buffers; ++idx) {
+ host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+ events.emplace_back(ggml_backend_event_new(cuda_backend));
+ }
+ }
+ }
+ #endif
+
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
  const auto * weight = get_weight(ggml_get_name(cur));
  if (weight == nullptr) {
@@ -3776,12 +3947,36 @@ struct llama_model_loader {
  }));
  }
  } else {
- read_buf.resize(n_size);
- file->seek(weight->offs, SEEK_SET);
- file->read_raw(read_buf.data(), n_size);
- ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
- if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ #if defined(GGML_USE_CUDA)
+ // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+ if (cuda_backend) {
+ file->seek(weight->offs, SEEK_SET);
+
+ size_t bytes_read = 0;
+
+ while (bytes_read < n_size) {
+ size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+ ggml_backend_event_synchronize(events[buffer_idx]);
+ file->read_raw(host_ptrs[buffer_idx], read_iteration);
+ ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+ ggml_backend_event_record(events[buffer_idx]);
+
+ bytes_read += read_iteration;
+ ++buffer_idx;
+ buffer_idx %= num_buffers;
+ }
+ }
+ else
+ #endif
+ {
+ read_buf.resize(n_size);
+ file->seek(weight->offs, SEEK_SET);
+ file->read_raw(read_buf.data(), n_size);
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+ }
  }
  }
  }
@@ -3789,6 +3984,18 @@ struct llama_model_loader {
  size_done += n_size;
  }

+ #if defined(GGML_USE_CUDA)
+ // free temporary resources used for async cuda uploads
+ if (cuda_backend) {
+ for (size_t idx = 0; idx < num_buffers;++idx) {
+ ggml_backend_event_synchronize(events[idx]);
+ ggml_backend_event_free(events[idx]);
+ ggml_backend_buffer_free(host_buffers[idx]);
+ }
+ ggml_backend_free(cuda_backend);
+ }
+ #endif
+
  // check validation results
  bool validation_failed = false;
  for (auto & future : validation_result) {
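Taken together, the three hunks above implement a small ring of pinned staging buffers: each chunk waits on its buffer's event, is read from disk into pinned host memory, queued as an asynchronous copy to the GPU, and the event is re-recorded before the index advances. A stripped-down sketch of that pattern, restating the diff with shortened names and no error handling (not additional API):

// ring of num_buffers pinned host buffers, one event per buffer
for (size_t off = 0; off < n_size; off += buffer_size) {
    const size_t n = std::min<size_t>(buffer_size, n_size - off);
    ggml_backend_event_synchronize(events[idx]);                        // wait until this buffer is reusable
    file->read_raw(host_ptrs[idx], n);                                  // disk -> pinned host memory
    ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[idx], off, n); // pinned host -> GPU, async
    ggml_backend_event_record(events[idx]);                             // mark the buffer as in flight
    idx = (idx + 1) % num_buffers;
}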
@@ -4255,6 +4462,9 @@ static void llm_load_hparams(
4255
4462
  } break;
4256
4463
  case LLM_ARCH_QWEN2MOE:
4257
4464
  {
4465
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
4466
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
4467
+
4258
4468
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4259
4469
  switch (hparams.n_layer) {
4260
4470
  case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4563,7 +4773,7 @@ static void llm_load_vocab(
4563
4773
 
4564
4774
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4565
4775
  if (add_space_prefix_keyidx != -1) {
4566
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4776
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4567
4777
  } // The default value of add_space_prefix is true.
4568
4778
  } else if (tokenizer_model == "bert") {
4569
4779
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4576,13 +4786,13 @@ static void llm_load_vocab(
4576
4786
  vocab.special_pad_id = 0;
4577
4787
  vocab.special_cls_id = 101;
4578
4788
  vocab.special_mask_id = 103;
4579
- vocab.add_space_prefix = false;
4789
+ vocab.tokenizer_add_space_prefix = false;
4580
4790
  } else if (tokenizer_model == "gpt2") {
4581
4791
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
4582
4792
 
4583
4793
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4584
4794
  if (add_space_prefix_keyidx != -1) {
4585
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4795
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4586
4796
  }
4587
4797
 
4588
4798
  // read bpe merges and populate bpe ranks
@@ -4640,6 +4850,8 @@ static void llm_load_vocab(
4640
4850
  tokenizer_pre == "llama-v3" ||
4641
4851
  tokenizer_pre == "llama-bpe") {
4642
4852
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4853
+ vocab.tokenizer_ignore_merges = true;
4854
+ vocab.tokenizer_add_bos = true;
4643
4855
  } else if (
4644
4856
  tokenizer_pre == "deepseek-llm") {
4645
4857
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4690,6 +4902,14 @@ static void llm_load_vocab(
4690
4902
  } else {
4691
4903
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4692
4904
  }
4905
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4906
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4907
+ vocab.tokenizer_add_bos = true;
4908
+ vocab.tokenizer_add_eos = false;
4909
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
4910
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4911
+ vocab.tokenizer_add_bos = true;
4912
+ vocab.tokenizer_add_eos = false;
4693
4913
  } else {
4694
4914
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4695
4915
  }
@@ -4721,6 +4941,7 @@ static void llm_load_vocab(
4721
4941
  GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
4722
4942
 
4723
4943
  vocab.token_to_id[word] = i;
4944
+ vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
4724
4945
 
4725
4946
  auto & token_data = vocab.id_to_token[i];
4726
4947
  token_data.text = std::move(word);
@@ -4834,10 +5055,10 @@ static void llm_load_vocab(
4834
5055
  bool temp = true;
4835
5056
 
4836
5057
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
4837
- vocab.special_add_bos = int(temp);
5058
+ vocab.tokenizer_add_bos = temp;
4838
5059
  }
4839
5060
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
4840
- vocab.special_add_eos = int(temp);
5061
+ vocab.tokenizer_add_eos = temp;
4841
5062
  }
4842
5063
  }
4843
5064
 
@@ -4937,7 +5158,7 @@ static void llm_load_vocab(
4937
5158
  );
4938
5159
 
4939
5160
  // set attributes by model/tokenizer name
4940
- if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
5161
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
4941
5162
  _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4942
5163
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4943
5164
  for (auto id : vocab.cache_special_tokens) {
@@ -5031,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5031
5252
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
5032
5253
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
5033
5254
 
5255
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
5256
+
5034
5257
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
5035
5258
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
5036
5259
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5040,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5040
5263
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
5041
5264
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
5042
5265
  }
5266
+
5267
+ if (model.arch == LLM_ARCH_QWEN2MOE) {
5268
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5269
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5270
+ }
5043
5271
  }
5044
5272
 
5045
5273
  // Returns false if cancelled by progress_callback
@@ -5183,7 +5411,7 @@ static bool llm_load_tensors(
5183
5411
  // create tensors for the weights
5184
5412
  {
5185
5413
  const int64_t n_embd = hparams.n_embd;
5186
- const int64_t n_embd_head = n_embd / hparams.n_head;
5414
+ const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
5187
5415
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
5188
5416
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
5189
5417
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5826,16 +6054,17 @@ static bool llm_load_tensors(
5826
6054
  GGML_ASSERT(hparams.n_expert_used > 0);
5827
6055
 
5828
6056
  // MoE branch
5829
- auto n_ff_exp = n_ff / hparams.n_expert_used;
6057
+ auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
5830
6058
  layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5831
6059
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5832
6060
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5833
6061
 
5834
6062
  // Shared expert branch
6063
+ auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
5835
6064
  layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5836
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5837
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5838
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
6065
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
6066
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
6067
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
5839
6068
  }
5840
6069
  } break;
5841
6070
  case LLM_ARCH_PHI2:
@@ -6625,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6625
6854
  }
6626
6855
  #endif
6627
6856
 
6628
- #ifdef GGML_USE_SYCL
6629
- if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
6630
- ggml_backend_sycl_set_single_device_mode(params.main_gpu);
6631
- //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
6632
- params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
6633
- } else {
6634
- ggml_backend_sycl_set_mul_device_mode();
6635
- }
6636
- #endif
6637
-
6638
6857
  if (!llm_load_tensors(
6639
6858
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
6640
6859
  params.progress_callback, params.progress_callback_user_data
@@ -7435,6 +7654,50 @@ struct llm_build_context {
7435
7654
  return lctx.inp_s_seq;
7436
7655
  }
7437
7656
 
7657
+ struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
7658
+ // find result_norm tensor for input
7659
+ struct ggml_tensor * inp = nullptr;
7660
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
7661
+ inp = gf->nodes[i];
7662
+ if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
7663
+ break;
7664
+ } else {
7665
+ inp = nullptr;
7666
+ }
7667
+ }
7668
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
7669
+
7670
+ struct ggml_tensor * cur;
7671
+
7672
+ switch (pooling_type) {
7673
+ case LLAMA_POOLING_TYPE_MEAN:
7674
+ {
7675
+ struct ggml_tensor * inp_mean = build_inp_mean();
7676
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
7677
+ } break;
7678
+ case LLAMA_POOLING_TYPE_CLS:
7679
+ case LLAMA_POOLING_TYPE_LAST:
7680
+ {
7681
+ struct ggml_tensor * inp_cls = build_inp_cls();
7682
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
7683
+ } break;
7684
+ case LLAMA_POOLING_TYPE_NONE:
7685
+ {
7686
+ cur = inp;
7687
+ } break;
7688
+ default:
7689
+ {
7690
+ GGML_ASSERT(false && "unknown pooling type");
7691
+ } break;
7692
+ }
7693
+
7694
+ cb(cur, "result_embd_pooled", -1);
7695
+
7696
+ ggml_build_forward_expand(gf, cur);
7697
+
7698
+ return gf;
7699
+ }
7700
+
7438
7701
  struct ggml_cgraph * build_llama() {
7439
7702
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7440
7703
 
@@ -8415,8 +8678,6 @@ struct llm_build_context {
8415
8678
  if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8416
8679
  inp_pos = build_inp_pos();
8417
8680
  }
8418
- struct ggml_tensor * inp_mean = build_inp_mean();
8419
- struct ggml_tensor * inp_cls = build_inp_cls();
8420
8681
 
8421
8682
  // construct input embeddings (token, type, position)
8422
8683
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8591,28 +8852,6 @@ struct llm_build_context {
8591
8852
  cur = inpL;
8592
8853
  cb(cur, "result_embd", -1);
8593
8854
 
8594
- // pooling layer
8595
- switch (pooling_type) {
8596
- case LLAMA_POOLING_TYPE_NONE:
8597
- {
8598
- // nop
8599
- } break;
8600
- case LLAMA_POOLING_TYPE_MEAN:
8601
- {
8602
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
8603
- cb(cur, "result_embd_pooled", -1);
8604
- } break;
8605
- case LLAMA_POOLING_TYPE_CLS:
8606
- {
8607
- cur = ggml_get_rows(ctx0, cur, inp_cls);
8608
- cb(cur, "result_embd_pooled", -1);
8609
- } break;
8610
- case LLAMA_POOLING_TYPE_UNSPECIFIED:
8611
- {
8612
- GGML_ASSERT(false && "Invalid pooling type");
8613
- } break;
8614
- }
8615
-
8616
8855
  ggml_build_forward_expand(gf, cur);
8617
8856
 
8618
8857
  return gf;
@@ -11697,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
11697
11936
  GGML_ASSERT(false);
11698
11937
  }
11699
11938
 
11939
+ // add on pooling layer
11940
+ if (lctx.cparams.embeddings) {
11941
+ result = llm.append_pooling(result);
11942
+ }
11943
+
11700
11944
  llm.free();
11701
11945
 
11702
11946
  return result;
@@ -11786,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11786
12030
  // (!a || b) is a logical implication (a -> b)
11787
12031
  // !hparams.causal_attn -> !cparams.causal_attn
11788
12032
  (hparams.causal_attn || !cparams.causal_attn) &&
11789
- "causal attention with embedding models is not supported"
12033
+ "causal attention is not supported by this model"
11790
12034
  );
11791
12035
 
11792
12036
  if (lctx.inp_KQ_mask) {
@@ -11918,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11918
12162
  }
11919
12163
  }
11920
12164
 
12165
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
12166
+ const int64_t n_tokens = batch.n_tokens;
12167
+
12168
+ GGML_ASSERT(lctx.inp_cls);
12169
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
12170
+
12171
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
12172
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
12173
+
12174
+ std::vector<int> last_pos(n_tokens, -1);
12175
+ std::vector<int> last_row(n_tokens, -1);
12176
+
12177
+ for (int i = 0; i < n_tokens; ++i) {
12178
+ const llama_seq_id seq_id = batch.seq_id[i][0];
12179
+ const llama_pos pos = batch.pos[i];
12180
+
12181
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
12182
+
12183
+ if (pos >= last_pos[seq_id]) {
12184
+ last_pos[seq_id] = pos;
12185
+ last_row[seq_id] = i;
12186
+ }
12187
+ }
12188
+
12189
+ for (int i = 0; i < n_tokens; ++i) {
12190
+ if (last_row[i] >= 0) {
12191
+ data[i] = last_row[i];
12192
+ }
12193
+ }
12194
+ }
12195
+
11921
12196
  if (kv_self.recurrent) {
11922
12197
  const int64_t n_kv = kv_self.n;
11923
12198
 
@@ -11979,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
11979
12254
  const auto n_embd = hparams.n_embd;
11980
12255
 
11981
12256
  // TODO: use a per-batch flag for logits presence instead
11982
- const bool has_logits = cparams.causal_attn;
11983
- const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
12257
+ const bool has_logits = !cparams.embeddings;
12258
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
11984
12259
 
11985
12260
  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
11986
12261
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -12110,11 +12385,13 @@ static int llama_decode_internal(
12110
12385
  std::vector<std::vector<llama_seq_id>> seq_id;
12111
12386
 
12112
12387
  // count outputs
12113
- if (batch_all.logits) {
12388
+ if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
12389
+ n_outputs = n_tokens_all;
12390
+ } else if (batch_all.logits) {
12114
12391
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
12115
12392
  n_outputs += batch_all.logits[i] != 0;
12116
12393
  }
12117
- } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
12394
+ } else if (lctx.logits_all) {
12118
12395
  n_outputs = n_tokens_all;
12119
12396
  } else {
12120
12397
  // keep last output only
@@ -12245,30 +12522,13 @@ static int llama_decode_internal(
12245
12522
  // no output
12246
12523
  res = nullptr;
12247
12524
  embd = nullptr;
12248
- } else if (!hparams.causal_attn) {
12249
- res = nullptr; // do not extract logits for embedding models such as BERT
12250
-
12251
- // token or sequence embeddings
12252
- embd = gf->nodes[gf->n_nodes - 1];
12253
-
12254
- GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
12255
12525
  } else if (cparams.embeddings) {
12256
- // the embeddings could be in the second to last tensor, or any of the previous tensors
12257
- int i_embd = gf->n_nodes - 2;
12258
- for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
12259
- i_embd = gf->n_nodes - i;
12260
- if (i_embd < 0) { break; }
12261
- embd = gf->nodes[i_embd];
12262
- }
12263
- GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
12264
-
12265
- // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
12266
- if (!cparams.causal_attn) {
12267
- res = nullptr; // do not extract logits when not needed
12268
- // skip computing logits
12269
- // TODO: is this safe?
12270
- gf->n_nodes = i_embd + 1;
12526
+ res = nullptr; // do not extract logits for embedding case
12527
+ embd = gf->nodes[gf->n_nodes - 1];
12528
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
12529
+ embd = gf->nodes[gf->n_nodes - 2];
12271
12530
  }
12531
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
12272
12532
  } else {
12273
12533
  embd = nullptr; // do not extract embeddings when not needed
12274
12534
  GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12337,11 +12597,10 @@ static int llama_decode_internal(
12337
12597
  ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
12338
12598
  }
12339
12599
  } break;
12340
- case LLAMA_POOLING_TYPE_CLS:
12341
12600
  case LLAMA_POOLING_TYPE_MEAN:
12601
+ case LLAMA_POOLING_TYPE_CLS:
12602
+ case LLAMA_POOLING_TYPE_LAST:
12342
12603
  {
12343
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
12344
-
12345
12604
  // extract sequence embeddings
12346
12605
  auto & embd_seq_out = lctx.embd_seq;
12347
12606
  embd_seq_out.clear();
@@ -12955,112 +13214,142 @@ struct llm_bigram_bpe {
12955
13214
  };
12956
13215
 
12957
13216
  struct llm_tokenizer_bpe {
12958
- llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
12959
-
12960
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12961
- int final_prev_index = -1;
12962
- bool ignore_merges = false;
12963
-
12964
- std::vector<std::string> word_collection;
12965
- switch (vocab.type) {
12966
- case LLAMA_VOCAB_TYPE_BPE:
12967
- switch (vocab.type_pre) {
12968
- case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12969
- ignore_merges = true;
12970
- word_collection = unicode_regex_split(text, {
12971
- // original regex from tokenizer.json
12972
- //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12973
-
12974
- // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12975
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12976
- });
12977
- break;
12978
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12979
- case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12980
- word_collection = unicode_regex_split(text, {
12981
- // same as llama3
12982
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12983
- });
12984
- break;
12985
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12986
- word_collection = unicode_regex_split(text, {
12987
- "[\r\n]",
12988
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12989
- "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12990
- "\\s+$",
12991
- "[一-龥ࠀ-一가-퟿]+",
12992
- "\\p{N}+",
12993
- });
12994
- break;
12995
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12996
- word_collection = unicode_regex_split(text, {
12997
- "[\r\n]",
12998
- "\\s?\\p{L}+",
12999
- "\\s?\\p{P}+",
13000
- "[一-龥ࠀ-一가-퟿]+",
13001
- "\\p{N}",
13002
- });
13003
- break;
13004
- case LLAMA_VOCAB_PRE_TYPE_FALCON:
13005
- word_collection = unicode_regex_split(text, {
13006
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
13007
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13008
- "[0-9][0-9][0-9]",
13009
- });
13010
- break;
13011
- case LLAMA_VOCAB_PRE_TYPE_MPT:
13012
- // TODO: MPT pre-tokenization regexes are unknown
13013
- // the following are close, but not exact. run the following:
13014
- // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
13015
- GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
13016
- word_collection = unicode_regex_split(text, {
13017
- "\\s?\\p{L}+",
13018
- "\\s?\\p{P}+",
13019
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13020
- });
13021
- break;
13022
- case LLAMA_VOCAB_PRE_TYPE_STARCODER:
13023
- case LLAMA_VOCAB_PRE_TYPE_REFACT:
13024
- case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
13025
- word_collection = unicode_regex_split(text, {
13026
- "\\p{N}",
13027
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13028
- });
13029
- break;
13030
- case LLAMA_VOCAB_PRE_TYPE_GPT2:
13031
- case LLAMA_VOCAB_PRE_TYPE_OLMO:
13032
- word_collection = unicode_regex_split(text, {
13033
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13034
- });
13035
- break;
13036
- case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
13037
- case LLAMA_VOCAB_PRE_TYPE_QWEN2:
13038
- word_collection = unicode_regex_split(text, {
13039
- // original regex from tokenizer.json
13040
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
13041
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13042
- });
13043
- break;
13044
- case LLAMA_VOCAB_PRE_TYPE_PORO:
13045
- word_collection = unicode_regex_split(text, {
13046
- " ?[^(\\s|.,!?…。,、।۔،)]+",
13047
- });
13048
- break;
13049
- default:
13050
- // default regex for BPE tokenization pre-processing
13051
- word_collection = unicode_regex_split(text, {
13052
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
13053
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13054
- "\\p{N}+",
13055
- "[0-9][0-9][0-9]",
13056
- });
13057
- break;
13058
- }
13217
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
13218
+ GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
13219
+ switch (vocab.type_pre) {
13220
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
13221
+ regex_exprs = {
13222
+ // original regex from tokenizer.json
13223
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13224
+
13225
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
13226
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13227
+ };
13228
+ break;
13229
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
13230
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
13231
+ regex_exprs = {
13232
+ // same as llama3
13233
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13234
+ };
13235
+ break;
13236
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
13237
+ regex_exprs = {
13238
+ "[\r\n]",
13239
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
13240
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
13241
+ "\\s+$",
13242
+ "[一-龥ࠀ-一가-퟿]+",
13243
+ "\\p{N}+",
13244
+ };
13245
+ break;
13246
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
13247
+ regex_exprs = {
13248
+ "[\r\n]",
13249
+ "\\s?\\p{L}+",
13250
+ "\\s?\\p{P}+",
13251
+ "[一-龥ࠀ-一가-퟿]+",
13252
+ "\\p{N}",
13253
+ };
13254
+ break;
13255
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
13256
+ regex_exprs = {
13257
+ "[\\p{P}\\$\\+<=>\\^~\\|`]+",
13258
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13259
+ "[0-9][0-9][0-9]",
13260
+ };
13261
+ break;
13262
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
13263
+ // TODO: MPT pre-tokenization regexes are unknown
13264
+ // the following are close, but not exact. run the following:
13265
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
13266
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
13267
+ regex_exprs = {
13268
+ "\\s?\\p{L}+",
13269
+ "\\s?\\p{P}+",
13270
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13271
+ };
13272
+ break;
13273
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
13274
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
13275
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
13276
+ regex_exprs = {
13277
+ "\\p{N}",
13278
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13279
+ };
13280
+ break;
13281
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
13282
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
13283
+ regex_exprs = {
13284
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13285
+ };
13286
+ break;
13287
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
13288
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
13289
+ regex_exprs = {
13290
+ // original regex from tokenizer.json
13291
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
13292
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13293
+ };
13294
+ break;
13295
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
13296
+ regex_exprs = {
13297
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
13298
+ };
13059
13299
  break;
13060
13300
  default:
13061
- GGML_ASSERT(false);
13301
+ // default regex for BPE tokenization pre-processing
13302
+ regex_exprs = {
13303
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
13304
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13305
+ "\\p{N}+",
13306
+ "[0-9][0-9][0-9]",
13307
+ };
13062
13308
  break;
13063
13309
  }
13310
+ }
13311
+
13312
+ void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
13313
+ output.push_back(token_id);
13314
+ }
13315
+
13316
+ bool append_bos(std::vector<llama_vocab::id> & output) const {
13317
+ if (vocab.tokenizer_add_bos) {
13318
+ GGML_ASSERT(vocab.special_bos_id != -1);
13319
+ output.push_back(vocab.special_bos_id);
13320
+ return true;
13321
+ }
13322
+ return false;
13323
+ }
13324
+
13325
+ bool append_eos(std::vector<llama_vocab::id> & output) const {
13326
+ if (vocab.tokenizer_add_eos) {
13327
+ GGML_ASSERT(vocab.special_eos_id != -1);
13328
+ output.push_back(vocab.special_eos_id);
13329
+ return true;
13330
+ }
13331
+ return false;
13332
+ }
13333
+
13334
+ void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
13335
+ if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13336
+ LLAMA_LOG_WARN(
13337
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13338
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13339
+ "Are you sure this is what you want?\n", __FUNCTION__);
13340
+ }
13341
+ if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
13342
+ LLAMA_LOG_WARN(
13343
+ "%s: Added a EOS token to the prompt as specified by the model but the prompt "
13344
+ "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
13345
+ "Are you sure this is what you want?\n", __FUNCTION__);
13346
+ }
13347
+ }
13348
+
13349
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
13350
+ int final_prev_index = -1;
13351
+
13352
+ const auto word_collection = unicode_regex_split(text, regex_exprs);
13064
13353
 
13065
13354
  symbols_final.clear();
13066
13355
 
@@ -13071,7 +13360,7 @@ struct llm_tokenizer_bpe {
13071
13360
  int index = 0;
13072
13361
  size_t offset = 0;
13073
13362
 
13074
- if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
13363
+ if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
13075
13364
  symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
13076
13365
  offset = word.size();
13077
13366
  }
@@ -13152,10 +13441,9 @@ struct llm_tokenizer_bpe {
13152
13441
  for (auto j = str.begin(); j != str.end(); ++j) {
13153
13442
  std::string byte_str(1, *j);
13154
13443
  auto token_multibyte = vocab.token_to_id.find(byte_str);
13155
- if (token_multibyte == vocab.token_to_id.end()) {
13156
- throw std::runtime_error("ERROR: byte not found in vocab");
13444
+ if (token_multibyte != vocab.token_to_id.end()) {
13445
+ output.push_back(token_multibyte->second);
13157
13446
  }
13158
- output.push_back((*token_multibyte).second);
13159
13447
  }
13160
13448
  } else {
13161
13449
  output.push_back((*token).second);
@@ -13194,6 +13482,8 @@ private:
13194
13482
 
13195
13483
  const llama_vocab & vocab;
13196
13484
 
13485
+ std::vector<std::string> regex_exprs;
13486
+
13197
13487
  std::vector<llm_symbol> symbols;
13198
13488
  std::vector<llm_symbol> symbols_final;
13199
13489
 
@@ -13203,7 +13493,7 @@ private:
13203
13493
  struct llm_tokenizer_wpm {
13204
13494
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
13205
13495
 
13206
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
13496
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
13207
13497
  const auto & token_map = vocab.token_to_id;
13208
13498
 
13209
13499
  // normalize and split by whitespace
@@ -13212,7 +13502,7 @@ struct llm_tokenizer_wpm {
13212
13502
  // bos token prepended already
13213
13503
 
13214
13504
  // find the longest tokens that form the words
13215
- for (const std::string &word : words) {
13505
+ for (const std::string & word : words) {
13216
13506
  // skip empty words
13217
13507
  if (word.size() == 0) {
13218
13508
  continue;
@@ -13229,7 +13519,7 @@ struct llm_tokenizer_wpm {
13229
13519
  for (int i = 0; i < n; ++i) {
13230
13520
  // loop through possible match length
13231
13521
  bool match = false;
13232
- for (int j = n; j > i; j--) {
13522
+ for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
13233
13523
  auto it = token_map.find(word1.substr(i, j - i));
13234
13524
  if (it != token_map.end()) {
13235
13525
  output.push_back(it->second);
@@ -13252,11 +13542,12 @@ struct llm_tokenizer_wpm {
13252
13542
  }
13253
13543
  }
13254
13544
 
13255
- std::vector<std::string> preprocess(const std::string & text) {
13545
+ // TODO: reduce string copies by using cpts_offs array
13546
+ std::vector<std::string> preprocess(const std::string & text) const {
13256
13547
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13257
13548
  std::vector<std::string> words(1, "");
13258
13549
 
13259
- for (const char32_t cpt : cpts_nfd) {
13550
+ for (const uint32_t cpt : cpts_nfd) {
13260
13551
  const auto flags = unicode_cpt_flags(cpt);
13261
13552
 
13262
13553
  if (flags.is_whitespace) {
@@ -13474,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13474
13765
 
13475
13766
  bool is_prev_special = false;
13476
13767
 
13477
- if (add_special && vocab.special_add_bos != 0) {
13768
+ if (add_special && vocab.tokenizer_add_bos) {
13478
13769
  GGML_ASSERT(vocab.special_bos_id != -1);
13479
13770
  output.push_back(vocab.special_bos_id);
13480
13771
  is_prev_special = true;
@@ -13484,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13484
13775
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13485
13776
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13486
13777
 
13487
- if (vocab.add_space_prefix) {
13778
+ if (vocab.tokenizer_add_space_prefix) {
13488
13779
  if (!output.size() || is_prev_special) { // prefix with space if first token
13489
13780
  raw_text = " " + raw_text;
13490
13781
  }
@@ -13502,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13502
13793
  }
13503
13794
  }
13504
13795
 
13505
- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13796
+ if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13506
13797
  LLAMA_LOG_WARN(
13507
13798
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13508
13799
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13509
13800
  "Are you sure this is what you want?\n", __FUNCTION__);
13510
13801
  }
13511
13802
 
13512
- if (add_special && vocab.special_add_eos == 1) {
13803
+ if (add_special && vocab.tokenizer_add_eos) {
13513
13804
  GGML_ASSERT(vocab.special_eos_id != -1);
13514
13805
  output.push_back(vocab.special_eos_id);
13515
13806
  }
13516
13807
  } break;
13517
13808
  case LLAMA_VOCAB_TYPE_BPE:
13518
13809
  {
13519
- if (add_special && vocab.special_add_bos != 0) {
13520
- GGML_ASSERT(vocab.special_bos_id != -1);
13521
- output.push_back(vocab.special_bos_id);
13810
+ llm_tokenizer_bpe tokenizer(vocab);
13811
+
13812
+ if (add_special) {
13813
+ tokenizer.append_bos(output);
13522
13814
  }
13523
13815
 
13524
13816
  for (const auto & fragment : fragment_buffer) {
@@ -13528,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13528
13820
  #ifdef PRETOKENIZERDEBUG
13529
13821
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
13530
13822
  #endif
13531
- llm_tokenizer_bpe tokenizer(vocab);
13532
13823
  tokenizer.tokenize(raw_text, output);
13533
13824
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13534
- output.push_back(fragment.token);
13825
+ tokenizer.append(fragment.token, output);
13535
13826
  }
13536
13827
  }
13537
13828
 
13538
- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13539
- LLAMA_LOG_WARN(
13540
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13541
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13542
- "Are you sure this is what you want?\n", __FUNCTION__);
13543
- }
13544
-
13545
- if (add_special && vocab.special_add_eos == 1) {
13546
- GGML_ASSERT(vocab.special_add_eos != -1);
13547
- output.push_back(vocab.special_eos_id);
13829
+ if (add_special) {
13830
+ tokenizer.append_eos(output);
13831
+ tokenizer.check_double_bos_eos(output);
13548
13832
  }
13549
13833
  } break;
13550
13834
  case LLAMA_VOCAB_TYPE_WPM:
@@ -13554,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13554
13838
  output.push_back(vocab.special_cls_id);
13555
13839
  }
13556
13840
 
13841
+ llm_tokenizer_wpm tokenizer(vocab);
13842
+
13557
13843
  for (const auto & fragment : fragment_buffer) {
13558
13844
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13559
13845
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13561,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13561
13847
  #ifdef PRETOKENIZERDEBUG
13562
13848
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
13563
13849
  #endif
13564
- llm_tokenizer_wpm tokenizer(vocab);
13565
13850
  tokenizer.tokenize(raw_text, output);
13566
13851
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13567
13852
  output.push_back(fragment.token);
@@ -16070,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
16070
16355
  params.flash_attn = false;
16071
16356
  }
16072
16357
 
16358
+ if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
16359
+ LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
16360
+ params.flash_attn = false;
16361
+ }
16362
+
16073
16363
  if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
16074
16364
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
16075
16365
  return nullptr;
@@ -16241,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
16241
16531
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
16242
16532
  ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
16243
16533
  if (backend == nullptr) {
16244
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
16245
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
16534
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
16246
16535
  llama_free(ctx);
16247
16536
  return nullptr;
16248
16537
  }
@@ -17870,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
  ctx->abort_callback_data = abort_callback_data;
  }

+ void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+ ctx->cparams.embeddings = embeddings;
+ }
+
  void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
  ctx->cparams.causal_attn = causal_attn;
  }
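The new setter lets callers flip an existing context between logits and embeddings output without recreating it. A hedged usage sketch (context creation and decoding omitted; the surrounding calls are assumptions about caller code, only llama_set_embeddings comes from this diff):

// hypothetical caller code
llama_set_embeddings(ctx, true);   // subsequent decodes produce (pooled) embeddings
// ... llama_decode(...), then read embeddings from the context ...
llama_set_embeddings(ctx, false);  // back to producing logits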
@@ -18113,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
18113
18406
  }
18114
18407
 
18115
18408
  int32_t llama_add_bos_token(const struct llama_model * model) {
18116
- return model->vocab.special_add_bos;
18409
+ return model->vocab.tokenizer_add_bos;
18117
18410
  }
18118
18411
 
18119
18412
  int32_t llama_add_eos_token(const struct llama_model * model) {
18120
- return model->vocab.special_add_eos;
18413
+ return model->vocab.tokenizer_add_eos;
18121
18414
  }
18122
18415
 
18123
18416
  llama_token llama_token_prefix(const struct llama_model * model) {