llama_cpp 0.16.1 → 0.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
data/vendor/tmp/llama.cpp/llama.cpp:

@@ -286,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-    { LLM_KV_VOCAB_SIZE,
-    { LLM_KV_CONTEXT_LENGTH,
-    { LLM_KV_EMBEDDING_LENGTH,
-    { LLM_KV_BLOCK_COUNT,
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,
-    { LLM_KV_FEED_FORWARD_LENGTH,
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
+    { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
+    { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_BLOCK_COUNT, "%s.block_count" },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
+    { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
+    { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
+    { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
+    { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
+    { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1278,6 +1280,126 @@ struct no_init {
 };

 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1298,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }

@@ -1308,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }

     void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1844,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;

@@ -1892,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;

         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2163,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;

@@ -2180,16 +2312,17 @@ struct llama_vocab {
     id special_cls_id = -1;
     id special_mask_id = -1;

-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
     id linefeed_id = 13;
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token

-
+    // tokenizer flags
+    bool tokenizer_add_space_prefix = true;
+    bool tokenizer_add_bos = false;
+    bool tokenizer_add_eos = false;
+    bool tokenizer_ignore_merges = false;

     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -3721,6 +3854,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3776,12 +3947,36 @@ struct llama_model_loader {
                 }));
             }
         } else {
-
-
-
-
-
-
+#if defined(GGML_USE_CUDA)
+            // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+            if (cuda_backend) {
+                file->seek(weight->offs, SEEK_SET);
+
+                size_t bytes_read = 0;
+
+                while (bytes_read < n_size) {
+                    size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                    ggml_backend_event_synchronize(events[buffer_idx]);
+                    file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                    ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                    ggml_backend_event_record(events[buffer_idx]);
+
+                    bytes_read += read_iteration;
+                    ++buffer_idx;
+                    buffer_idx %= num_buffers;
+                }
+            }
+            else
+#endif
+            {
+                read_buf.resize(n_size);
+                file->seek(weight->offs, SEEK_SET);
+                file->read_raw(read_buf.data(), n_size);
+                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                }
             }
         }
     }
@@ -3789,6 +3984,18 @@ struct llama_model_loader {
         size_done += n_size;
     }

+#if defined(GGML_USE_CUDA)
+    // free temporary resources used for async cuda uploads
+    if (cuda_backend) {
+        for (size_t idx = 0; idx < num_buffers;++idx) {
+            ggml_backend_event_synchronize(events[idx]);
+            ggml_backend_event_free(events[idx]);
+            ggml_backend_buffer_free(host_buffers[idx]);
+        }
+        ggml_backend_free(cuda_backend);
+    }
+#endif
+
     // check validation results
     bool validation_failed = false;
     for (auto & future : validation_result) {
@@ -4255,6 +4462,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
@@ -4563,7 +4773,7 @@ static void llm_load_vocab(

         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         } // The default value of add_space_prefix is true.
     } else if (tokenizer_model == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4576,13 +4786,13 @@ static void llm_load_vocab(
         vocab.special_pad_id = 0;
         vocab.special_cls_id = 101;
         vocab.special_mask_id = 103;
-        vocab.
+        vocab.tokenizer_add_space_prefix = false;
     } else if (tokenizer_model == "gpt2") {
         vocab.type = LLAMA_VOCAB_TYPE_BPE;

         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         }

         // read bpe merges and populate bpe ranks
@@ -4640,6 +4850,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
             } else if (
                 tokenizer_pre == "deepseek-llm") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4690,6 +4902,14 @@ static void llm_load_vocab(
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_bos = true;
+            vocab.tokenizer_add_eos = false;
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_bos = true;
+            vocab.tokenizer_add_eos = false;
         } else {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
@@ -4721,6 +4941,7 @@ static void llm_load_vocab(
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

         vocab.token_to_id[word] = i;
+        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

         auto & token_data = vocab.id_to_token[i];
         token_data.text = std::move(word);
@@ -4834,10 +5055,10 @@ static void llm_load_vocab(
         bool temp = true;

         if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-            vocab.
+            vocab.tokenizer_add_bos = temp;
         }
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-            vocab.
+            vocab.tokenizer_add_eos = temp;
         }
     }

@@ -4937,7 +5158,7 @@ static void llm_load_vocab(
     );

     // set attributes by model/tokenizer name
-    if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-
+    if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
         _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
     } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
         for (auto id : vocab.cache_special_tokens) {
@@ -5031,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }

+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5040,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5183,7 +5411,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
-        const int64_t n_embd_head = n_embd / hparams.n_head;
+        const int64_t n_embd_head = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -5826,16 +6054,17 @@ static bool llm_load_tensors(
                         GGML_ASSERT(hparams.n_expert_used > 0);

                         // MoE branch
-                        auto n_ff_exp = n_ff / hparams.n_expert_used;
+                        auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                         layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

                         // Shared expert branch
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd,
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {
-                        layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd,
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
@@ -6625,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 #endif

-#ifdef GGML_USE_SYCL
-    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_sycl_set_single_device_mode(params.main_gpu);
-        //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
-        params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
-    } else {
-        ggml_backend_sycl_set_mul_device_mode();
-    }
-#endif
-
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
@@ -7435,6 +7654,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }

+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

@@ -8415,8 +8678,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls = build_inp_cls();

         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8591,28 +8852,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);

-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);

         return gf;
@@ -11697,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }

+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();

     return result;
@@ -11786,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         // (!a || b) is a logical implication (a -> b)
         // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention
+        "causal attention is not supported by this model"
     );

     if (lctx.inp_KQ_mask) {
@@ -11918,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos pos = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;

@@ -11979,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd = hparams.n_embd;

     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.
-    const bool has_embd =
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0;
@@ -12110,11 +12385,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;

     // count outputs
-    if (
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all
+    } else if (lctx.logits_all) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -12245,30 +12522,13 @@ static int llama_decode_internal(
             // no output
             res = nullptr;
             embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-
-
-
-
-            if (i_embd < 0) { break; }
-            embd = gf->nodes[i_embd];
-            }
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12337,11 +12597,10 @@ static int llama_decode_internal(
                     ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                 }
             } break;
-        case LLAMA_POOLING_TYPE_CLS:
         case LLAMA_POOLING_TYPE_MEAN:
+        case LLAMA_POOLING_TYPE_CLS:
+        case LLAMA_POOLING_TYPE_LAST:
             {
-                GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                 // extract sequence embeddings
                 auto & embd_seq_out = lctx.embd_seq;
                 embd_seq_out.clear();
@@ -12955,112 +13214,142 @@ struct llm_bigram_bpe {
 };

 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
-    ... (old lines 12959-13039 were also removed; their content is not shown in this rendering) ...
-                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                });
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_PORO:
-                word_collection = unicode_regex_split(text, {
-                    " ?[^(\\s|.,!?…。,、।۔،)]+",
-                });
-                break;
-            default:
-                // default regex for BPE tokenization pre-processing
-                word_collection = unicode_regex_split(text, {
-                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "\\p{N}+",
-                    "[0-9][0-9][0-9]",
-                });
-                break;
-        }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.type_pre) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+                // TODO: MPT pre-tokenization regexes are unknown
+                //       the following are close, but not exact. run the following:
+                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                regex_exprs = {
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
                 break;
             default:
-
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
                 break;
         }
+    }
+
+    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+
+        const auto word_collection = unicode_regex_split(text, regex_exprs);

         symbols_final.clear();

@@ -13071,7 +13360,7 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

-            if (
+            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                 offset = word.size();
             }
@@ -13152,10 +13441,9 @@ struct llm_tokenizer_bpe {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte
-
+                    if (token_multibyte != vocab.token_to_id.end()) {
+                        output.push_back(token_multibyte->second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -13194,6 +13482,8 @@ private:

     const llama_vocab & vocab;

+    std::vector<std::string> regex_exprs;
+
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;

@@ -13203,7 +13493,7 @@ private:
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;

         // normalize and split by whitespace
@@ -13212,7 +13502,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already

         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13229,7 +13519,7 @@ struct llm_tokenizer_wpm {
         for (int i = 0; i < n; ++i) {
             // loop through possible match length
             bool match = false;
-            for (int j = n; j > i; j--) {
+            for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                 auto it = token_map.find(word1.substr(i, j - i));
                 if (it != token_map.end()) {
                     output.push_back(it->second);
@@ -13252,11 +13542,12 @@ struct llm_tokenizer_wpm {
         }
     }

-
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
         const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
         std::vector<std::string> words(1, "");

-        for (const
+        for (const uint32_t cpt : cpts_nfd) {
             const auto flags = unicode_cpt_flags(cpt);

             if (flags.is_whitespace) {
@@ -13474,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

                 bool is_prev_special = false;

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                     is_prev_special = true;
@@ -13484,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-                        if (vocab.
+                        if (vocab.tokenizer_add_space_prefix) {
                             if (!output.size() || is_prev_special) { // prefix with space if first token
                                 raw_text = " " + raw_text;
                             }
@@ -13502,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                     LLAMA_LOG_WARN(
                         "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                         "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                         "Are you sure this is what you want?\n", __FUNCTION__);
                 }

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_eos) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-
-
-
+                llm_tokenizer_bpe tokenizer(vocab);
+
+                if (add_special) {
+                    tokenizer.append_bos(output);
                 }

                 for (const auto & fragment : fragment_buffer) {
@@ -13528,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_bpe tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-
+                        tokenizer.append(fragment.token, output);
                     }
                 }

-                if (add_special
-
-
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.append_eos(output);
+                    tokenizer.check_double_bos_eos(output);
                 }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
@@ -13554,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_cls_id);
                 }

+                llm_tokenizer_wpm tokenizer(vocab);
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13561,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
@@ -16070,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }

+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
         LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
         return nullptr;
@@ -16241,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
             if (backend == nullptr) {
-
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
                 llama_free(ctx);
                 return nullptr;
             }
@@ -17870,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
     ctx->abort_callback_data = abort_callback_data;
 }

+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
     ctx->cparams.causal_attn = causal_attn;
 }
@@ -18113,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
 }

 int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.
+    return model->vocab.tokenizer_add_bos;
 }

 int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.
+    return model->vocab.tokenizer_add_eos;
 }

 llama_token llama_token_prefix(const struct llama_model * model) {