llama_cpp 0.16.1 → 0.16.2

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -286,6 +286,7 @@ enum llm_kv {
286
286
  LLM_KV_LEADING_DENSE_BLOCK_COUNT,
287
287
  LLM_KV_FEED_FORWARD_LENGTH,
288
288
  LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
289
+ LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
289
290
  LLM_KV_USE_PARALLEL_RESIDUAL,
290
291
  LLM_KV_TENSOR_DATA_LAYOUT,
291
292
  LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
364
365
  { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
365
366
  { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
366
367
 
367
- { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
368
- { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
369
- { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
370
- { LLM_KV_BLOCK_COUNT, "%s.block_count" },
371
- { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
372
- { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
373
- { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
374
- { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
375
- { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
376
- { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
377
- { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
378
- { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
379
- { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
380
- { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
381
- { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
368
+ { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
369
+ { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
370
+ { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
371
+ { LLM_KV_BLOCK_COUNT, "%s.block_count" },
372
+ { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
373
+ { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
374
+ { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
375
+ { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
376
+ { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
377
+ { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
378
+ { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
379
+ { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
380
+ { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
381
+ { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
382
+ { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
383
+ { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
382
384
 
383
385
  { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
384
386
  { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1278,6 +1280,126 @@ struct no_init {
1278
1280
  };
1279
1281
 
1280
1282
  struct llama_file {
1283
+
1284
+ #if defined(_WIN32)
1285
+ // use FILE * so we don't have to re-open the file to mmap
1286
+ FILE * fp;
1287
+ HANDLE fp_win32;
1288
+ size_t size;
1289
+
1290
+ private:
1291
+ std::string GetErrorMessageWin32(DWORD error_code) const {
1292
+ std::string ret;
1293
+ LPSTR lpMsgBuf = NULL;
1294
+ DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
1295
+ NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
1296
+ if (!bufLen) {
1297
+ ret = format("Win32 error code: %s", error_code);
1298
+ } else {
1299
+ ret = lpMsgBuf;
1300
+ LocalFree(lpMsgBuf);
1301
+ }
1302
+
1303
+ return ret;
1304
+ }
1305
+
1306
+ public:
1307
+
1308
+ llama_file(const char * fname, const char * mode) {
1309
+ fp = ggml_fopen(fname, mode);
1310
+ if (fp == NULL) {
1311
+ throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
1312
+ }
1313
+ fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
1314
+ seek(0, SEEK_END);
1315
+ size = tell();
1316
+ seek(0, SEEK_SET);
1317
+ }
1318
+
1319
+ size_t tell() const {
1320
+ // SetFilePointerEx returns the current position when seeking relative 0 bytes
1321
+ LARGE_INTEGER li;
1322
+ li.QuadPart = 0;
1323
+ BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
1324
+ if (!ret) {
1325
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1326
+ }
1327
+
1328
+ return li.QuadPart;
1329
+ }
1330
+
1331
+ void seek(size_t offset, int whence) const {
1332
+ // no need to convert SEEK_* to FILE_*. The enums are the same.
1333
+ // Still, keep static asserts to avoid failures in the future.
1334
+ static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
1335
+ static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
1336
+ static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
1337
+
1338
+ LARGE_INTEGER li;
1339
+ li.QuadPart = offset;
1340
+ BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
1341
+ if (!ret) {
1342
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1343
+ }
1344
+ }
1345
+
1346
+ void read_raw(void * ptr, size_t len) const {
1347
+ // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
1348
+ // use the Win32 API to do file io instead of the C/C++ library functions.
1349
+
1350
+ // There are conditions under which ReadFile cannot read chunks >64MB.
1351
+ // Thus split the operation into smaller chunks if len exceeds this limit.
1352
+ size_t bytes_read = 0;
1353
+ while (bytes_read < len) {
1354
+ size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
1355
+ DWORD chunk_read = 0;
1356
+ BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
1357
+ if (!result) {
1358
+ throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1359
+ }
1360
+ if (chunk_read < chunk_size || chunk_read == 0) {
1361
+ throw std::runtime_error("unexpectedly reached end of file");
1362
+ }
1363
+
1364
+ bytes_read += chunk_read;
1365
+ } ;
1366
+ }
1367
+
1368
+ uint32_t read_u32() const {
1369
+ uint32_t val;
1370
+ read_raw(&val, sizeof(val));
1371
+ return val;
1372
+ }
1373
+
1374
+ void write_raw(const void * ptr, size_t len) const {
1375
+ // There are conditions under which WriteFile cannot write chunks >64MB.
1376
+ // Thus split the operation into smaller chunks if len exceeds this limit.
1377
+ size_t bytes_written = 0;
1378
+ while (bytes_written < len) {
1379
+ size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
1380
+ DWORD chunk_written = 0;
1381
+ BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
1382
+ if (!result) {
1383
+ throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
1384
+ }
1385
+ if (chunk_written < chunk_size || chunk_written == 0) {
1386
+ throw std::runtime_error("unexpectedly failed to write bytes");
1387
+ }
1388
+
1389
+ bytes_written += chunk_written;
1390
+ }
1391
+ }
1392
+
1393
+ void write_u32(std::uint32_t val) const {
1394
+ write_raw(&val, sizeof(val));
1395
+ }
1396
+
1397
+ ~llama_file() {
1398
+ if (fp) {
1399
+ std::fclose(fp);
1400
+ }
1401
+ }
1402
+ #else
1281
1403
  // use FILE * so we don't have to re-open the file to mmap
1282
1404
  FILE * fp;
1283
1405
  size_t size;
@@ -1298,7 +1420,10 @@ struct llama_file {
1298
1420
  #else
1299
1421
  long ret = std::ftell(fp);
1300
1422
  #endif
1301
- GGML_ASSERT(ret != -1); // this really shouldn't fail
1423
+ if (ret == -1) {
1424
+ throw std::runtime_error(format("ftell error: %s", strerror(errno)));
1425
+ }
1426
+
1302
1427
  return (size_t) ret;
1303
1428
  }
1304
1429
 
@@ -1308,7 +1433,9 @@ struct llama_file {
1308
1433
  #else
1309
1434
  int ret = std::fseek(fp, (long) offset, whence);
1310
1435
  #endif
1311
- GGML_ASSERT(ret == 0); // same
1436
+ if (ret != 0) {
1437
+ throw std::runtime_error(format("seek error: %s", strerror(errno)));
1438
+ }
1312
1439
  }
1313
1440
 
1314
1441
  void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1478,7 @@ struct llama_file {
1351
1478
  std::fclose(fp);
1352
1479
  }
1353
1480
  }
1481
+ #endif
1354
1482
  };
1355
1483
  using llama_files = std::vector<std::unique_ptr<llama_file>>;
1356
1484
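
Note (illustrative, not part of the diff): the new Win32 branch of llama_file above splits every read and write into chunks of at most 64 MiB, because ReadFile/WriteFile can fail on larger requests in some configurations. A minimal standalone sketch of the same chunking pattern, assuming a valid HANDLE such as the one obtained via _get_osfhandle above:

    #include <windows.h>
    #include <algorithm>
    #include <cstddef>
    #include <stdexcept>

    // Read exactly `len` bytes from `h` in chunks no larger than 64 MiB.
    static void read_all(HANDLE h, void * dst, size_t len) {
        constexpr size_t max_chunk = 64u*1024u*1024u;
        size_t done = 0;
        while (done < len) {
            const DWORD want = (DWORD) std::min(len - done, max_chunk);
            DWORD got = 0;
            if (!ReadFile(h, (char *) dst + done, want, &got, NULL) || got == 0) {
                throw std::runtime_error("ReadFile failed or hit end of file");
            }
            done += got;  // ReadFile may legally return fewer bytes than requested
        }
    }
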
 
@@ -1844,6 +1972,7 @@ struct llama_hparams {
1844
1972
  uint32_t n_lora_q = 0;
1845
1973
  uint32_t n_lora_kv = 0;
1846
1974
  uint32_t n_ff_exp = 0;
1975
+ uint32_t n_ff_shexp = 0;
1847
1976
  uint32_t n_expert_shared = 0;
1848
1977
  float expert_weights_scale = 0.0;
1849
1978
 
@@ -1892,6 +2021,7 @@ struct llama_hparams {
1892
2021
  if (this->n_lora_q != other.n_lora_q) return true;
1893
2022
  if (this->n_lora_kv != other.n_lora_kv) return true;
1894
2023
  if (this->n_ff_exp != other.n_ff_exp) return true;
2024
+ if (this->n_ff_shexp != other.n_ff_shexp) return true;
1895
2025
  if (this->n_expert_shared != other.n_expert_shared) return true;
1896
2026
 
1897
2027
  if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2163,6 +2293,8 @@ struct llama_vocab {
2163
2293
  enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
2164
2294
  enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2165
2295
 
2296
+ int max_token_len = 0; // used for optimizing longest token search
2297
+
2166
2298
  std::unordered_map<token, id> token_to_id;
2167
2299
  std::vector<token_data> id_to_token;
2168
2300
 
@@ -2180,16 +2312,17 @@ struct llama_vocab {
2180
2312
  id special_cls_id = -1;
2181
2313
  id special_mask_id = -1;
2182
2314
 
2183
- int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
2184
- int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
2185
-
2186
2315
  id linefeed_id = 13;
2187
2316
  id special_prefix_id = -1;
2188
2317
  id special_suffix_id = -1;
2189
2318
  id special_middle_id = -1;
2190
2319
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
2191
2320
 
2192
- bool add_space_prefix = true;
2321
+ // tokenizer flags
2322
+ bool tokenizer_add_space_prefix = true;
2323
+ bool tokenizer_add_bos = false;
2324
+ bool tokenizer_add_eos = false;
2325
+ bool tokenizer_ignore_merges = false;
2193
2326
 
2194
2327
  int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
2195
2328
  GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -3721,6 +3854,44 @@ struct llama_model_loader {
3721
3854
  std::vector<no_init<uint8_t>> read_buf;
3722
3855
  std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
3723
3856
 
3857
+ #if defined(GGML_USE_CUDA)
3858
+ // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
3859
+ // NVMe raid configurations might require more / larger buffers.
3860
+ constexpr size_t num_buffers = 4;
3861
+ constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
3862
+
3863
+ std::vector<ggml_backend_buffer_t> host_buffers;
3864
+ std::vector<void*> host_ptrs;
3865
+ std::vector<ggml_backend_event_t> events;
3866
+ size_t buffer_idx = 0; // buffer to use for async loads
3867
+
3868
+ ggml_backend_t cuda_backend = nullptr;
3869
+ if (!use_mmap && !check_tensors) {
3870
+ // When not using mmaped io use async uploads from pinned memory to GPU memory.
3871
+ // First determine if the CUDA backend is active, and if so, determine the device ID.
3872
+ ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
3873
+ if (buf) {
3874
+ ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
3875
+ for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
3876
+ auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
3877
+ if (buffer_type == cuda_buffer_type) {
3878
+ cuda_backend = ggml_backend_cuda_init(i);
3879
+ break;
3880
+ }
3881
+ }
3882
+ }
3883
+
3884
+ // If the cuda backend is active create pinned memory buffers and events for synchronisation.
3885
+ if (cuda_backend) {
3886
+ for (size_t idx = 0; idx < num_buffers; ++idx) {
3887
+ host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
3888
+ host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
3889
+ events.emplace_back(ggml_backend_event_new(cuda_backend));
3890
+ }
3891
+ }
3892
+ }
3893
+ #endif
3894
+
3724
3895
  for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3725
3896
  const auto * weight = get_weight(ggml_get_name(cur));
3726
3897
  if (weight == nullptr) {
@@ -3776,12 +3947,36 @@ struct llama_model_loader {
3776
3947
  }));
3777
3948
  }
3778
3949
  } else {
3779
- read_buf.resize(n_size);
3780
- file->seek(weight->offs, SEEK_SET);
3781
- file->read_raw(read_buf.data(), n_size);
3782
- ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3783
- if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3784
- throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3950
+ #if defined(GGML_USE_CUDA)
3951
+ // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
3952
+ if (cuda_backend) {
3953
+ file->seek(weight->offs, SEEK_SET);
3954
+
3955
+ size_t bytes_read = 0;
3956
+
3957
+ while (bytes_read < n_size) {
3958
+ size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
3959
+
3960
+ ggml_backend_event_synchronize(events[buffer_idx]);
3961
+ file->read_raw(host_ptrs[buffer_idx], read_iteration);
3962
+ ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
3963
+ ggml_backend_event_record(events[buffer_idx]);
3964
+
3965
+ bytes_read += read_iteration;
3966
+ ++buffer_idx;
3967
+ buffer_idx %= num_buffers;
3968
+ }
3969
+ }
3970
+ else
3971
+ #endif
3972
+ {
3973
+ read_buf.resize(n_size);
3974
+ file->seek(weight->offs, SEEK_SET);
3975
+ file->read_raw(read_buf.data(), n_size);
3976
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
3977
+ if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
3978
+ throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
3979
+ }
3785
3980
  }
3786
3981
  }
3787
3982
  }
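
Note (illustrative, not part of the diff): the CUDA path above pipelines disk reads with device uploads by cycling through a small pool of pinned staging buffers and waiting on each buffer's event before reusing it. A generic sketch of that round-robin pattern, with the ggml backend calls replaced by hypothetical std::function hooks (wait_buf, fill_buf, upload_buf):

    #include <algorithm>
    #include <cstddef>
    #include <functional>

    // Round-robin staged upload: overlap reading the next chunk with copying the previous one.
    struct staged_uploader {
        size_t n_bufs;                                            // e.g. 4 staging buffers
        size_t buf_size;                                          // e.g. 1 MiB each
        std::function<void(size_t)>                 wait_buf;     // wait until buffer is reusable (event sync)
        std::function<void(size_t, size_t)>         fill_buf;     // read the next n bytes into the buffer
        std::function<void(size_t, size_t, size_t)> upload_buf;   // async device copy + record event

        void upload(size_t total) const {
            size_t off = 0, buf = 0;
            while (off < total) {
                const size_t n = std::min(buf_size, total - off);
                wait_buf(buf);            // don't overwrite a buffer that is still being copied
                fill_buf(buf, n);         // stage the chunk in pinned host memory
                upload_buf(buf, off, n);  // kick off the asynchronous upload
                off += n;
                buf  = (buf + 1) % n_bufs;
            }
        }
    };

With more than two buffers, the file read for chunk k can proceed while the uploads of chunks k-1, k-2, ... are still in flight, which is the point of the pool sized above.
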
@@ -3789,6 +3984,18 @@ struct llama_model_loader {
3789
3984
  size_done += n_size;
3790
3985
  }
3791
3986
 
3987
+ #if defined(GGML_USE_CUDA)
3988
+ // free temporary resources used for async cuda uploads
3989
+ if (cuda_backend) {
3990
+ for (size_t idx = 0; idx < num_buffers;++idx) {
3991
+ ggml_backend_event_synchronize(events[idx]);
3992
+ ggml_backend_event_free(events[idx]);
3993
+ ggml_backend_buffer_free(host_buffers[idx]);
3994
+ }
3995
+ ggml_backend_free(cuda_backend);
3996
+ }
3997
+ #endif
3998
+
3792
3999
  // check validation results
3793
4000
  bool validation_failed = false;
3794
4001
  for (auto & future : validation_result) {
@@ -4255,6 +4462,9 @@ static void llm_load_hparams(
4255
4462
  } break;
4256
4463
  case LLM_ARCH_QWEN2MOE:
4257
4464
  {
4465
+ ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
4466
+ ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
4467
+
4258
4468
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
4259
4469
  switch (hparams.n_layer) {
4260
4470
  case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4563,7 +4773,7 @@ static void llm_load_vocab(
4563
4773
 
4564
4774
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4565
4775
  if (add_space_prefix_keyidx != -1) {
4566
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4776
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4567
4777
  } // The default value of add_space_prefix is true.
4568
4778
  } else if (tokenizer_model == "bert") {
4569
4779
  vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4576,13 +4786,13 @@ static void llm_load_vocab(
4576
4786
  vocab.special_pad_id = 0;
4577
4787
  vocab.special_cls_id = 101;
4578
4788
  vocab.special_mask_id = 103;
4579
- vocab.add_space_prefix = false;
4789
+ vocab.tokenizer_add_space_prefix = false;
4580
4790
  } else if (tokenizer_model == "gpt2") {
4581
4791
  vocab.type = LLAMA_VOCAB_TYPE_BPE;
4582
4792
 
4583
4793
  const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
4584
4794
  if (add_space_prefix_keyidx != -1) {
4585
- vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4795
+ vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
4586
4796
  }
4587
4797
 
4588
4798
  // read bpe merges and populate bpe ranks
@@ -4640,6 +4850,8 @@ static void llm_load_vocab(
4640
4850
  tokenizer_pre == "llama-v3" ||
4641
4851
  tokenizer_pre == "llama-bpe") {
4642
4852
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4853
+ vocab.tokenizer_ignore_merges = true;
4854
+ vocab.tokenizer_add_bos = true;
4643
4855
  } else if (
4644
4856
  tokenizer_pre == "deepseek-llm") {
4645
4857
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4690,6 +4902,14 @@ static void llm_load_vocab(
4690
4902
  } else {
4691
4903
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4692
4904
  }
4905
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
4906
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4907
+ vocab.tokenizer_add_bos = true;
4908
+ vocab.tokenizer_add_eos = false;
4909
+ } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
4910
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4911
+ vocab.tokenizer_add_bos = true;
4912
+ vocab.tokenizer_add_eos = false;
4693
4913
  } else {
4694
4914
  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4695
4915
  }
@@ -4721,6 +4941,7 @@ static void llm_load_vocab(
4721
4941
  GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
4722
4942
 
4723
4943
  vocab.token_to_id[word] = i;
4944
+ vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
4724
4945
 
4725
4946
  auto & token_data = vocab.id_to_token[i];
4726
4947
  token_data.text = std::move(word);
@@ -4834,10 +5055,10 @@ static void llm_load_vocab(
4834
5055
  bool temp = true;
4835
5056
 
4836
5057
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
4837
- vocab.special_add_bos = int(temp);
5058
+ vocab.tokenizer_add_bos = temp;
4838
5059
  }
4839
5060
  if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
4840
- vocab.special_add_eos = int(temp);
5061
+ vocab.tokenizer_add_eos = temp;
4841
5062
  }
4842
5063
  }
4843
5064
 
@@ -4937,7 +5158,7 @@ static void llm_load_vocab(
4937
5158
  );
4938
5159
 
4939
5160
  // set attributes by model/tokenizer name
4940
- if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
5161
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
4941
5162
  _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
4942
5163
  } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
4943
5164
  for (auto id : vocab.cache_special_tokens) {
@@ -5031,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5031
5252
  if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
5032
5253
  if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
5033
5254
 
5255
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
5256
+
5034
5257
  if (model.arch == LLM_ARCH_DEEPSEEK2) {
5035
5258
  LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
5036
5259
  LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5040,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
5040
5263
  LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
5041
5264
  LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
5042
5265
  }
5266
+
5267
+ if (model.arch == LLM_ARCH_QWEN2MOE) {
5268
+ LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
5269
+ LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
5270
+ }
5043
5271
  }
5044
5272
 
5045
5273
  // Returns false if cancelled by progress_callback
@@ -5183,7 +5411,7 @@ static bool llm_load_tensors(
5183
5411
  // create tensors for the weights
5184
5412
  {
5185
5413
  const int64_t n_embd = hparams.n_embd;
5186
- const int64_t n_embd_head = n_embd / hparams.n_head;
5414
+ const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
5187
5415
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
5188
5416
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
5189
5417
  const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5826,16 +6054,17 @@ static bool llm_load_tensors(
5826
6054
  GGML_ASSERT(hparams.n_expert_used > 0);
5827
6055
 
5828
6056
  // MoE branch
5829
- auto n_ff_exp = n_ff / hparams.n_expert_used;
6057
+ auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
5830
6058
  layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5831
6059
  layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
5832
6060
  layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
5833
6061
 
5834
6062
  // Shared expert branch
6063
+ auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
5835
6064
  layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
5836
- layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
5837
- layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
5838
- layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
6065
+ layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
6066
+ layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
6067
+ layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
5839
6068
  }
5840
6069
  } break;
5841
6070
  case LLM_ARCH_PHI2:
@@ -6625,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
6625
6854
  }
6626
6855
  #endif
6627
6856
 
6628
- #ifdef GGML_USE_SYCL
6629
- if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
6630
- ggml_backend_sycl_set_single_device_mode(params.main_gpu);
6631
- //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
6632
- params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
6633
- } else {
6634
- ggml_backend_sycl_set_mul_device_mode();
6635
- }
6636
- #endif
6637
-
6638
6857
  if (!llm_load_tensors(
6639
6858
  ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
6640
6859
  params.progress_callback, params.progress_callback_user_data
@@ -7435,6 +7654,50 @@ struct llm_build_context {
7435
7654
  return lctx.inp_s_seq;
7436
7655
  }
7437
7656
 
7657
+ struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
7658
+ // find result_norm tensor for input
7659
+ struct ggml_tensor * inp = nullptr;
7660
+ for (int i = gf->n_nodes - 1; i >= 0; --i) {
7661
+ inp = gf->nodes[i];
7662
+ if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
7663
+ break;
7664
+ } else {
7665
+ inp = nullptr;
7666
+ }
7667
+ }
7668
+ GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
7669
+
7670
+ struct ggml_tensor * cur;
7671
+
7672
+ switch (pooling_type) {
7673
+ case LLAMA_POOLING_TYPE_MEAN:
7674
+ {
7675
+ struct ggml_tensor * inp_mean = build_inp_mean();
7676
+ cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
7677
+ } break;
7678
+ case LLAMA_POOLING_TYPE_CLS:
7679
+ case LLAMA_POOLING_TYPE_LAST:
7680
+ {
7681
+ struct ggml_tensor * inp_cls = build_inp_cls();
7682
+ cur = ggml_get_rows(ctx0, inp, inp_cls);
7683
+ } break;
7684
+ case LLAMA_POOLING_TYPE_NONE:
7685
+ {
7686
+ cur = inp;
7687
+ } break;
7688
+ default:
7689
+ {
7690
+ GGML_ASSERT(false && "unknown pooling type");
7691
+ } break;
7692
+ }
7693
+
7694
+ cb(cur, "result_embd_pooled", -1);
7695
+
7696
+ ggml_build_forward_expand(gf, cur);
7697
+
7698
+ return gf;
7699
+ }
7700
+
7438
7701
  struct ggml_cgraph * build_llama() {
7439
7702
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
7440
7703
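
Note (illustrative, not part of the diff): append_pooling above attaches the pooling step to the end of whatever graph was built. For LLAMA_POOLING_TYPE_MEAN the matrix multiply with inp_mean is equivalent to averaging the token embeddings of each sequence; CLS and LAST instead gather a single row per sequence via ggml_get_rows. A plain C++ sketch of what the MEAN case computes, assuming `rows` is non-empty:

    #include <cstdint>
    #include <vector>

    // embd holds n_tokens rows of n_embd floats; rows lists the token rows of one sequence.
    static std::vector<float> mean_pool(const std::vector<float> & embd, int64_t n_embd,
                                        const std::vector<int64_t> & rows) {
        std::vector<float> out(n_embd, 0.0f);
        for (int64_t r : rows) {
            for (int64_t j = 0; j < n_embd; ++j) {
                out[j] += embd[r*n_embd + j];
            }
        }
        for (int64_t j = 0; j < n_embd; ++j) {
            out[j] /= (float) rows.size();  // average over the sequence length
        }
        return out;
    }
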
 
@@ -8415,8 +8678,6 @@ struct llm_build_context {
8415
8678
  if (model.arch != LLM_ARCH_JINA_BERT_V2) {
8416
8679
  inp_pos = build_inp_pos();
8417
8680
  }
8418
- struct ggml_tensor * inp_mean = build_inp_mean();
8419
- struct ggml_tensor * inp_cls = build_inp_cls();
8420
8681
 
8421
8682
  // construct input embeddings (token, type, position)
8422
8683
  inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8591,28 +8852,6 @@ struct llm_build_context {
8591
8852
  cur = inpL;
8592
8853
  cb(cur, "result_embd", -1);
8593
8854
 
8594
- // pooling layer
8595
- switch (pooling_type) {
8596
- case LLAMA_POOLING_TYPE_NONE:
8597
- {
8598
- // nop
8599
- } break;
8600
- case LLAMA_POOLING_TYPE_MEAN:
8601
- {
8602
- cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
8603
- cb(cur, "result_embd_pooled", -1);
8604
- } break;
8605
- case LLAMA_POOLING_TYPE_CLS:
8606
- {
8607
- cur = ggml_get_rows(ctx0, cur, inp_cls);
8608
- cb(cur, "result_embd_pooled", -1);
8609
- } break;
8610
- case LLAMA_POOLING_TYPE_UNSPECIFIED:
8611
- {
8612
- GGML_ASSERT(false && "Invalid pooling type");
8613
- } break;
8614
- }
8615
-
8616
8855
  ggml_build_forward_expand(gf, cur);
8617
8856
 
8618
8857
  return gf;
@@ -11697,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
11697
11936
  GGML_ASSERT(false);
11698
11937
  }
11699
11938
 
11939
+ // add on pooling layer
11940
+ if (lctx.cparams.embeddings) {
11941
+ result = llm.append_pooling(result);
11942
+ }
11943
+
11700
11944
  llm.free();
11701
11945
 
11702
11946
  return result;
@@ -11786,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11786
12030
  // (!a || b) is a logical implication (a -> b)
11787
12031
  // !hparams.causal_attn -> !cparams.causal_attn
11788
12032
  (hparams.causal_attn || !cparams.causal_attn) &&
11789
- "causal attention with embedding models is not supported"
12033
+ "causal attention is not supported by this model"
11790
12034
  );
11791
12035
 
11792
12036
  if (lctx.inp_KQ_mask) {
@@ -11918,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
11918
12162
  }
11919
12163
  }
11920
12164
 
12165
+ if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
12166
+ const int64_t n_tokens = batch.n_tokens;
12167
+
12168
+ GGML_ASSERT(lctx.inp_cls);
12169
+ GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
12170
+
12171
+ uint32_t * data = (uint32_t *) lctx.inp_cls->data;
12172
+ memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
12173
+
12174
+ std::vector<int> last_pos(n_tokens, -1);
12175
+ std::vector<int> last_row(n_tokens, -1);
12176
+
12177
+ for (int i = 0; i < n_tokens; ++i) {
12178
+ const llama_seq_id seq_id = batch.seq_id[i][0];
12179
+ const llama_pos pos = batch.pos[i];
12180
+
12181
+ GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
12182
+
12183
+ if (pos >= last_pos[seq_id]) {
12184
+ last_pos[seq_id] = pos;
12185
+ last_row[seq_id] = i;
12186
+ }
12187
+ }
12188
+
12189
+ for (int i = 0; i < n_tokens; ++i) {
12190
+ if (last_row[i] >= 0) {
12191
+ data[i] = last_row[i];
12192
+ }
12193
+ }
12194
+ }
12195
+
11921
12196
  if (kv_self.recurrent) {
11922
12197
  const int64_t n_kv = kv_self.n;
11923
12198
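
Note (illustrative, not part of the diff): for the new LLAMA_POOLING_TYPE_LAST, the block above fills inp_cls with, for each sequence id, the batch row holding that sequence's highest position; ggml_get_rows then gathers those rows. The same index selection, isolated (assumes seq_id[i] < n_tokens, as asserted above):

    #include <cstdint>
    #include <vector>

    static std::vector<int32_t> last_token_rows(const std::vector<int32_t> & seq_id,
                                                const std::vector<int32_t> & pos) {
        const size_t n_tokens = seq_id.size();
        std::vector<int32_t> last_pos(n_tokens, -1);
        std::vector<int32_t> last_row(n_tokens, -1);
        for (size_t i = 0; i < n_tokens; ++i) {
            if (pos[i] >= last_pos[seq_id[i]]) {  // a later position wins
                last_pos[seq_id[i]] = pos[i];
                last_row[seq_id[i]] = (int32_t) i;
            }
        }
        return last_row;  // -1 for sequence ids that received no tokens
    }
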
 
@@ -11979,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
11979
12254
  const auto n_embd = hparams.n_embd;
11980
12255
 
11981
12256
  // TODO: use a per-batch flag for logits presence instead
11982
- const bool has_logits = cparams.causal_attn;
11983
- const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
12257
+ const bool has_logits = !cparams.embeddings;
12258
+ const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
11984
12259
 
11985
12260
  const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
11986
12261
  const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -12110,11 +12385,13 @@ static int llama_decode_internal(
12110
12385
  std::vector<std::vector<llama_seq_id>> seq_id;
12111
12386
 
12112
12387
  // count outputs
12113
- if (batch_all.logits) {
12388
+ if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
12389
+ n_outputs = n_tokens_all;
12390
+ } else if (batch_all.logits) {
12114
12391
  for (uint32_t i = 0; i < n_tokens_all; ++i) {
12115
12392
  n_outputs += batch_all.logits[i] != 0;
12116
12393
  }
12117
- } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) {
12394
+ } else if (lctx.logits_all) {
12118
12395
  n_outputs = n_tokens_all;
12119
12396
  } else {
12120
12397
  // keep last output only
@@ -12245,30 +12522,13 @@ static int llama_decode_internal(
12245
12522
  // no output
12246
12523
  res = nullptr;
12247
12524
  embd = nullptr;
12248
- } else if (!hparams.causal_attn) {
12249
- res = nullptr; // do not extract logits for embedding models such as BERT
12250
-
12251
- // token or sequence embeddings
12252
- embd = gf->nodes[gf->n_nodes - 1];
12253
-
12254
- GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
12255
12525
  } else if (cparams.embeddings) {
12256
- // the embeddings could be in the second to last tensor, or any of the previous tensors
12257
- int i_embd = gf->n_nodes - 2;
12258
- for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) {
12259
- i_embd = gf->n_nodes - i;
12260
- if (i_embd < 0) { break; }
12261
- embd = gf->nodes[i_embd];
12262
- }
12263
- GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
12264
-
12265
- // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
12266
- if (!cparams.causal_attn) {
12267
- res = nullptr; // do not extract logits when not needed
12268
- // skip computing logits
12269
- // TODO: is this safe?
12270
- gf->n_nodes = i_embd + 1;
12526
+ res = nullptr; // do not extract logits for embedding case
12527
+ embd = gf->nodes[gf->n_nodes - 1];
12528
+ if (strcmp(embd->name, "result_embd_pooled") != 0) {
12529
+ embd = gf->nodes[gf->n_nodes - 2];
12271
12530
  }
12531
+ GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
12272
12532
  } else {
12273
12533
  embd = nullptr; // do not extract embeddings when not needed
12274
12534
  GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12337,11 +12597,10 @@ static int llama_decode_internal(
12337
12597
  ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
12338
12598
  }
12339
12599
  } break;
12340
- case LLAMA_POOLING_TYPE_CLS:
12341
12600
  case LLAMA_POOLING_TYPE_MEAN:
12601
+ case LLAMA_POOLING_TYPE_CLS:
12602
+ case LLAMA_POOLING_TYPE_LAST:
12342
12603
  {
12343
- GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
12344
-
12345
12604
  // extract sequence embeddings
12346
12605
  auto & embd_seq_out = lctx.embd_seq;
12347
12606
  embd_seq_out.clear();
@@ -12955,112 +13214,142 @@ struct llm_bigram_bpe {
12955
13214
  };
12956
13215
 
12957
13216
  struct llm_tokenizer_bpe {
12958
- llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
12959
-
12960
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
12961
- int final_prev_index = -1;
12962
- bool ignore_merges = false;
12963
-
12964
- std::vector<std::string> word_collection;
12965
- switch (vocab.type) {
12966
- case LLAMA_VOCAB_TYPE_BPE:
12967
- switch (vocab.type_pre) {
12968
- case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
12969
- ignore_merges = true;
12970
- word_collection = unicode_regex_split(text, {
12971
- // original regex from tokenizer.json
12972
- //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12973
-
12974
- // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
12975
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12976
- });
12977
- break;
12978
- case LLAMA_VOCAB_PRE_TYPE_DBRX:
12979
- case LLAMA_VOCAB_PRE_TYPE_SMAUG:
12980
- word_collection = unicode_regex_split(text, {
12981
- // same as llama3
12982
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
12983
- });
12984
- break;
12985
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
12986
- word_collection = unicode_regex_split(text, {
12987
- "[\r\n]",
12988
- "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
12989
- "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
12990
- "\\s+$",
12991
- "[一-龥ࠀ-一가-퟿]+",
12992
- "\\p{N}+",
12993
- });
12994
- break;
12995
- case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
12996
- word_collection = unicode_regex_split(text, {
12997
- "[\r\n]",
12998
- "\\s?\\p{L}+",
12999
- "\\s?\\p{P}+",
13000
- "[一-龥ࠀ-一가-퟿]+",
13001
- "\\p{N}",
13002
- });
13003
- break;
13004
- case LLAMA_VOCAB_PRE_TYPE_FALCON:
13005
- word_collection = unicode_regex_split(text, {
13006
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
13007
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13008
- "[0-9][0-9][0-9]",
13009
- });
13010
- break;
13011
- case LLAMA_VOCAB_PRE_TYPE_MPT:
13012
- // TODO: MPT pre-tokenization regexes are unknown
13013
- // the following are close, but not exact. run the following:
13014
- // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
13015
- GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
13016
- word_collection = unicode_regex_split(text, {
13017
- "\\s?\\p{L}+",
13018
- "\\s?\\p{P}+",
13019
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13020
- });
13021
- break;
13022
- case LLAMA_VOCAB_PRE_TYPE_STARCODER:
13023
- case LLAMA_VOCAB_PRE_TYPE_REFACT:
13024
- case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
13025
- word_collection = unicode_regex_split(text, {
13026
- "\\p{N}",
13027
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13028
- });
13029
- break;
13030
- case LLAMA_VOCAB_PRE_TYPE_GPT2:
13031
- case LLAMA_VOCAB_PRE_TYPE_OLMO:
13032
- word_collection = unicode_regex_split(text, {
13033
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13034
- });
13035
- break;
13036
- case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
13037
- case LLAMA_VOCAB_PRE_TYPE_QWEN2:
13038
- word_collection = unicode_regex_split(text, {
13039
- // original regex from tokenizer.json
13040
- // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
13041
- "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13042
- });
13043
- break;
13044
- case LLAMA_VOCAB_PRE_TYPE_PORO:
13045
- word_collection = unicode_regex_split(text, {
13046
- " ?[^(\\s|.,!?…。,、।۔،)]+",
13047
- });
13048
- break;
13049
- default:
13050
- // default regex for BPE tokenization pre-processing
13051
- word_collection = unicode_regex_split(text, {
13052
- "[\\p{P}\\$\\+<=>\\^~\\|]+",
13053
- "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13054
- "\\p{N}+",
13055
- "[0-9][0-9][0-9]",
13056
- });
13057
- break;
13058
- }
13217
+ llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
13218
+ GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
13219
+ switch (vocab.type_pre) {
13220
+ case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
13221
+ regex_exprs = {
13222
+ // original regex from tokenizer.json
13223
+ //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13224
+
13225
+ // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
13226
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13227
+ };
13228
+ break;
13229
+ case LLAMA_VOCAB_PRE_TYPE_DBRX:
13230
+ case LLAMA_VOCAB_PRE_TYPE_SMAUG:
13231
+ regex_exprs = {
13232
+ // same as llama3
13233
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13234
+ };
13235
+ break;
13236
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
13237
+ regex_exprs = {
13238
+ "[\r\n]",
13239
+ "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
13240
+ "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
13241
+ "\\s+$",
13242
+ "[一-龥ࠀ-一가-퟿]+",
13243
+ "\\p{N}+",
13244
+ };
13245
+ break;
13246
+ case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
13247
+ regex_exprs = {
13248
+ "[\r\n]",
13249
+ "\\s?\\p{L}+",
13250
+ "\\s?\\p{P}+",
13251
+ "[一-龥ࠀ-一가-퟿]+",
13252
+ "\\p{N}",
13253
+ };
13254
+ break;
13255
+ case LLAMA_VOCAB_PRE_TYPE_FALCON:
13256
+ regex_exprs = {
13257
+ "[\\p{P}\\$\\+<=>\\^~\\|`]+",
13258
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13259
+ "[0-9][0-9][0-9]",
13260
+ };
13261
+ break;
13262
+ case LLAMA_VOCAB_PRE_TYPE_MPT:
13263
+ // TODO: MPT pre-tokenization regexes are unknown
13264
+ // the following are close, but not exact. run the following:
13265
+ // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
13266
+ GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
13267
+ regex_exprs = {
13268
+ "\\s?\\p{L}+",
13269
+ "\\s?\\p{P}+",
13270
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13271
+ };
13272
+ break;
13273
+ case LLAMA_VOCAB_PRE_TYPE_STARCODER:
13274
+ case LLAMA_VOCAB_PRE_TYPE_REFACT:
13275
+ case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
13276
+ regex_exprs = {
13277
+ "\\p{N}",
13278
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13279
+ };
13280
+ break;
13281
+ case LLAMA_VOCAB_PRE_TYPE_GPT2:
13282
+ case LLAMA_VOCAB_PRE_TYPE_OLMO:
13283
+ regex_exprs = {
13284
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13285
+ };
13286
+ break;
13287
+ case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
13288
+ case LLAMA_VOCAB_PRE_TYPE_QWEN2:
13289
+ regex_exprs = {
13290
+ // original regex from tokenizer.json
13291
+ // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
13292
+ "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
13293
+ };
13294
+ break;
13295
+ case LLAMA_VOCAB_PRE_TYPE_PORO:
13296
+ regex_exprs = {
13297
+ " ?[^(\\s|.,!?…。,、।۔،)]+",
13298
+ };
13059
13299
  break;
13060
13300
  default:
13061
- GGML_ASSERT(false);
13301
+ // default regex for BPE tokenization pre-processing
13302
+ regex_exprs = {
13303
+ "[\\p{P}\\$\\+<=>\\^~\\|]+",
13304
+ "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
13305
+ "\\p{N}+",
13306
+ "[0-9][0-9][0-9]",
13307
+ };
13062
13308
  break;
13063
13309
  }
13310
+ }
13311
+
13312
+ void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
13313
+ output.push_back(token_id);
13314
+ }
13315
+
13316
+ bool append_bos(std::vector<llama_vocab::id> & output) const {
13317
+ if (vocab.tokenizer_add_bos) {
13318
+ GGML_ASSERT(vocab.special_bos_id != -1);
13319
+ output.push_back(vocab.special_bos_id);
13320
+ return true;
13321
+ }
13322
+ return false;
13323
+ }
13324
+
13325
+ bool append_eos(std::vector<llama_vocab::id> & output) const {
13326
+ if (vocab.tokenizer_add_eos) {
13327
+ GGML_ASSERT(vocab.special_eos_id != -1);
13328
+ output.push_back(vocab.special_eos_id);
13329
+ return true;
13330
+ }
13331
+ return false;
13332
+ }
13333
+
13334
+ void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
13335
+ if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13336
+ LLAMA_LOG_WARN(
13337
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13338
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13339
+ "Are you sure this is what you want?\n", __FUNCTION__);
13340
+ }
13341
+ if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
13342
+ LLAMA_LOG_WARN(
13343
+ "%s: Added a EOS token to the prompt as specified by the model but the prompt "
13344
+ "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
13345
+ "Are you sure this is what you want?\n", __FUNCTION__);
13346
+ }
13347
+ }
13348
+
13349
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
13350
+ int final_prev_index = -1;
13351
+
13352
+ const auto word_collection = unicode_regex_split(text, regex_exprs);
13064
13353
 
13065
13354
  symbols_final.clear();
13066
13355
 
@@ -13071,7 +13360,7 @@ struct llm_tokenizer_bpe {
13071
13360
  int index = 0;
13072
13361
  size_t offset = 0;
13073
13362
 
13074
- if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
13363
+ if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
13075
13364
  symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
13076
13365
  offset = word.size();
13077
13366
  }
@@ -13152,10 +13441,9 @@ struct llm_tokenizer_bpe {
13152
13441
  for (auto j = str.begin(); j != str.end(); ++j) {
13153
13442
  std::string byte_str(1, *j);
13154
13443
  auto token_multibyte = vocab.token_to_id.find(byte_str);
13155
- if (token_multibyte == vocab.token_to_id.end()) {
13156
- throw std::runtime_error("ERROR: byte not found in vocab");
13444
+ if (token_multibyte != vocab.token_to_id.end()) {
13445
+ output.push_back(token_multibyte->second);
13157
13446
  }
13158
- output.push_back((*token_multibyte).second);
13159
13447
  }
13160
13448
  } else {
13161
13449
  output.push_back((*token).second);
@@ -13194,6 +13482,8 @@ private:
13194
13482
 
13195
13483
  const llama_vocab & vocab;
13196
13484
 
13485
+ std::vector<std::string> regex_exprs;
13486
+
13197
13487
  std::vector<llm_symbol> symbols;
13198
13488
  std::vector<llm_symbol> symbols_final;
13199
13489
 
@@ -13203,7 +13493,7 @@ private:
13203
13493
  struct llm_tokenizer_wpm {
13204
13494
  llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}
13205
13495
 
13206
- void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
13496
+ void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
13207
13497
  const auto & token_map = vocab.token_to_id;
13208
13498
 
13209
13499
  // normalize and split by whitespace
@@ -13212,7 +13502,7 @@ struct llm_tokenizer_wpm {
13212
13502
  // bos token prepended already
13213
13503
 
13214
13504
  // find the longest tokens that form the words
13215
- for (const std::string &word : words) {
13505
+ for (const std::string & word : words) {
13216
13506
  // skip empty words
13217
13507
  if (word.size() == 0) {
13218
13508
  continue;
@@ -13229,7 +13519,7 @@ struct llm_tokenizer_wpm {
13229
13519
  for (int i = 0; i < n; ++i) {
13230
13520
  // loop through possible match length
13231
13521
  bool match = false;
13232
- for (int j = n; j > i; j--) {
13522
+ for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
13233
13523
  auto it = token_map.find(word1.substr(i, j - i));
13234
13524
  if (it != token_map.end()) {
13235
13525
  output.push_back(it->second);
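
Note (illustrative, not part of the diff): the new max_token_len bound above caps the WPM longest-match search at the length of the longest token actually present in the vocabulary, instead of always scanning to the end of the word. A simplified byte-level sketch of the bounded greedy match (the real code works on code points and falls back to the UNK token on a miss):

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <vector>

    static std::vector<int> greedy_longest_match(const std::string & word,
                                                 const std::unordered_map<std::string, int> & token_to_id,
                                                 int max_token_len) {
        std::vector<int> out;
        const int n = (int) word.size();
        for (int i = 0; i < n; ) {
            int j = std::min(n, i + max_token_len);  // no vocab entry is longer than this
            for (; j > i; --j) {
                auto it = token_to_id.find(word.substr(i, j - i));
                if (it != token_to_id.end()) {
                    out.push_back(it->second);
                    break;
                }
            }
            if (j == i) {
                break;  // no match at position i; the real tokenizer emits UNK here
            }
            i = j;
        }
        return out;
    }
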
@@ -13252,11 +13542,12 @@ struct llm_tokenizer_wpm {
13252
13542
  }
13253
13543
  }
13254
13544
 
13255
- std::vector<std::string> preprocess(const std::string & text) {
13545
+ // TODO: reduce string copies by using cpts_offs array
13546
+ std::vector<std::string> preprocess(const std::string & text) const {
13256
13547
  const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
13257
13548
  std::vector<std::string> words(1, "");
13258
13549
 
13259
- for (const char32_t cpt : cpts_nfd) {
13550
+ for (const uint32_t cpt : cpts_nfd) {
13260
13551
  const auto flags = unicode_cpt_flags(cpt);
13261
13552
 
13262
13553
  if (flags.is_whitespace) {
@@ -13474,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13474
13765
 
13475
13766
  bool is_prev_special = false;
13476
13767
 
13477
- if (add_special && vocab.special_add_bos != 0) {
13768
+ if (add_special && vocab.tokenizer_add_bos) {
13478
13769
  GGML_ASSERT(vocab.special_bos_id != -1);
13479
13770
  output.push_back(vocab.special_bos_id);
13480
13771
  is_prev_special = true;
@@ -13484,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13484
13775
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13485
13776
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
13486
13777
 
13487
- if (vocab.add_space_prefix) {
13778
+ if (vocab.tokenizer_add_space_prefix) {
13488
13779
  if (!output.size() || is_prev_special) { // prefix with space if first token
13489
13780
  raw_text = " " + raw_text;
13490
13781
  }
@@ -13502,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13502
13793
  }
13503
13794
  }
13504
13795
 
13505
- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13796
+ if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13506
13797
  LLAMA_LOG_WARN(
13507
13798
  "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13508
13799
  "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13509
13800
  "Are you sure this is what you want?\n", __FUNCTION__);
13510
13801
  }
13511
13802
 
13512
- if (add_special && vocab.special_add_eos == 1) {
13803
+ if (add_special && vocab.tokenizer_add_eos) {
13513
13804
  GGML_ASSERT(vocab.special_eos_id != -1);
13514
13805
  output.push_back(vocab.special_eos_id);
13515
13806
  }
13516
13807
  } break;
13517
13808
  case LLAMA_VOCAB_TYPE_BPE:
13518
13809
  {
13519
- if (add_special && vocab.special_add_bos != 0) {
13520
- GGML_ASSERT(vocab.special_bos_id != -1);
13521
- output.push_back(vocab.special_bos_id);
13810
+ llm_tokenizer_bpe tokenizer(vocab);
13811
+
13812
+ if (add_special) {
13813
+ tokenizer.append_bos(output);
13522
13814
  }
13523
13815
 
13524
13816
  for (const auto & fragment : fragment_buffer) {
@@ -13528,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13528
13820
  #ifdef PRETOKENIZERDEBUG
13529
13821
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
13530
13822
  #endif
13531
- llm_tokenizer_bpe tokenizer(vocab);
13532
13823
  tokenizer.tokenize(raw_text, output);
13533
13824
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13534
- output.push_back(fragment.token);
13825
+ tokenizer.append(fragment.token, output);
13535
13826
  }
13536
13827
  }
13537
13828
 
13538
- if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
13539
- LLAMA_LOG_WARN(
13540
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
13541
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
13542
- "Are you sure this is what you want?\n", __FUNCTION__);
13543
- }
13544
-
13545
- if (add_special && vocab.special_add_eos == 1) {
13546
- GGML_ASSERT(vocab.special_add_eos != -1);
13547
- output.push_back(vocab.special_eos_id);
13829
+ if (add_special) {
13830
+ tokenizer.append_eos(output);
13831
+ tokenizer.check_double_bos_eos(output);
13548
13832
  }
13549
13833
  } break;
13550
13834
  case LLAMA_VOCAB_TYPE_WPM:
@@ -13554,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13554
13838
  output.push_back(vocab.special_cls_id);
13555
13839
  }
13556
13840
 
13841
+ llm_tokenizer_wpm tokenizer(vocab);
13842
+
13557
13843
  for (const auto & fragment : fragment_buffer) {
13558
13844
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
13559
13845
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13561,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
13561
13847
  #ifdef PRETOKENIZERDEBUG
13562
13848
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
13563
13849
  #endif
13564
- llm_tokenizer_wpm tokenizer(vocab);
13565
13850
  tokenizer.tokenize(raw_text, output);
13566
13851
  } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
13567
13852
  output.push_back(fragment.token);
@@ -16070,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
16070
16355
  params.flash_attn = false;
16071
16356
  }
16072
16357
 
16358
+ if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
16359
+ LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
16360
+ params.flash_attn = false;
16361
+ }
16362
+
16073
16363
  if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
16074
16364
  LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
16075
16365
  return nullptr;
@@ -16241,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
16241
16531
  if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
16242
16532
  ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
16243
16533
  if (backend == nullptr) {
16244
- int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
16245
- LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
16534
+ LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
16246
16535
  llama_free(ctx);
16247
16536
  return nullptr;
16248
16537
  }
@@ -17870,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
17870
18159
  ctx->abort_callback_data = abort_callback_data;
17871
18160
  }
17872
18161
 
18162
+ void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
18163
+ ctx->cparams.embeddings = embeddings;
18164
+ }
18165
+
17873
18166
  void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
17874
18167
  ctx->cparams.causal_attn = causal_attn;
17875
18168
  }
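
Note (illustrative, not part of the diff): llama_set_embeddings() is a new public setter that flips an existing context in and out of embedding mode. A hedged usage sketch, assuming a context created with a non-NONE pooling type and an already prepared batch; error handling omitted:

    #include "llama.h"

    // Returns the pooled embedding of sequence 0 (length llama_n_embd(model)), or NULL on failure.
    static const float * embed_sequence(struct llama_context * ctx, struct llama_batch batch) {
        llama_set_embeddings(ctx, true);   // enable embedding output for subsequent decodes
        if (llama_decode(ctx, batch) != 0) {
            return NULL;
        }
        return llama_get_embeddings_seq(ctx, 0);
    }
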
@@ -18113,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
18113
18406
  }
18114
18407
 
18115
18408
  int32_t llama_add_bos_token(const struct llama_model * model) {
18116
- return model->vocab.special_add_bos;
18409
+ return model->vocab.tokenizer_add_bos;
18117
18410
  }
18118
18411
 
18119
18412
  int32_t llama_add_eos_token(const struct llama_model * model) {
18120
- return model->vocab.special_add_eos;
18413
+ return model->vocab.tokenizer_add_eos;
18121
18414
  }
18122
18415
 
18123
18416
  llama_token llama_token_prefix(const struct llama_model * model) {