llama_cpp 0.16.1 → 0.16.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/extconf.rb +1 -0
- data/ext/llama_cpp/llama_cpp.cpp +12 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +2 -0
- data/vendor/tmp/llama.cpp/Makefile +10 -2
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +3 -0
- data/vendor/tmp/llama.cpp/ggml-cuda/mmq.cu +10 -10
- data/vendor/tmp/llama.cpp/ggml-cuda/mmvq.cu +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda/unary.cu +28 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +6 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +982 -368
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +8 -3
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +2124 -13202
- data/vendor/tmp/llama.cpp/ggml-sycl.h +1 -10
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +27564 -23876
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +278 -366
- data/vendor/tmp/llama.cpp/ggml.c +67 -150
- data/vendor/tmp/llama.cpp/ggml.h +6 -0
- data/vendor/tmp/llama.cpp/llama.cpp +530 -237
- data/vendor/tmp/llama.cpp/llama.h +5 -1
- data/vendor/tmp/llama.cpp/sgemm.cpp +2 -0
- data/vendor/tmp/llama.cpp/unicode-data.cpp +851 -801
- data/vendor/tmp/llama.cpp/unicode.cpp +33 -19
- data/vendor/tmp/llama.cpp/unicode.h +1 -1
- metadata +2 -2
@@ -286,6 +286,7 @@ enum llm_kv {
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
     LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
+    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
     LLM_KV_USE_PARALLEL_RESIDUAL,
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
@@ -364,21 +365,22 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_SOURCE_URL,     "general.source.url"                    },
     { LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },

-    { LLM_KV_VOCAB_SIZE,                 "%s.vocab_size"                 },
-    { LLM_KV_CONTEXT_LENGTH,             "%s.context_length"             },
-    { LLM_KV_EMBEDDING_LENGTH,           "%s.embedding_length"           },
-    { LLM_KV_BLOCK_COUNT,                "%s.block_count"                },
-    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,  "%s.leading_dense_block_count"  },
-    { LLM_KV_FEED_FORWARD_LENGTH,        "%s.feed_forward_length"        },
-    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
-    { LLM_KV_USE_PARALLEL_RESIDUAL,      "%s.use_parallel_residual"      },
-    { LLM_KV_TENSOR_DATA_LAYOUT,         "%s.tensor_data_layout"         },
-    { LLM_KV_EXPERT_COUNT,               "%s.expert_count"               },
-    { LLM_KV_EXPERT_USED_COUNT,          "%s.expert_used_count"          },
-    { LLM_KV_EXPERT_SHARED_COUNT,        "%s.expert_shared_count"        },
-    { LLM_KV_EXPERT_WEIGHTS_SCALE,       "%s.expert_weights_scale"       },
-    { LLM_KV_POOLING_TYPE ,              "%s.pooling_type"               },
-    { LLM_KV_LOGIT_SCALE,                "%s.logit_scale"                },
+    { LLM_KV_VOCAB_SIZE,                        "%s.vocab_size"                        },
+    { LLM_KV_CONTEXT_LENGTH,                    "%s.context_length"                    },
+    { LLM_KV_EMBEDDING_LENGTH,                  "%s.embedding_length"                  },
+    { LLM_KV_BLOCK_COUNT,                       "%s.block_count"                       },
+    { LLM_KV_LEADING_DENSE_BLOCK_COUNT,         "%s.leading_dense_block_count"         },
+    { LLM_KV_FEED_FORWARD_LENGTH,               "%s.feed_forward_length"               },
+    { LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        "%s.expert_feed_forward_length"        },
+    { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" },
+    { LLM_KV_USE_PARALLEL_RESIDUAL,             "%s.use_parallel_residual"             },
+    { LLM_KV_TENSOR_DATA_LAYOUT,                "%s.tensor_data_layout"                },
+    { LLM_KV_EXPERT_COUNT,                      "%s.expert_count"                      },
+    { LLM_KV_EXPERT_USED_COUNT,                 "%s.expert_used_count"                 },
+    { LLM_KV_EXPERT_SHARED_COUNT,               "%s.expert_shared_count"               },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
+    { LLM_KV_POOLING_TYPE ,                     "%s.pooling_type"                      },
+    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
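The "%s" in each LLM_KV_NAMES entry is a printf-style placeholder that llama.cpp later fills with the architecture name, so the new entry resolves to keys such as qwen2moe.expert_shared_feed_forward_length. A minimal sketch of that substitution, assuming a made-up helper name (kv_for_arch is illustrative, not a llama.cpp function):

// Hedged sketch, not llama.cpp code: how a "%s" template from LLM_KV_NAMES
// becomes an architecture-specific GGUF key name.
#include <cstdio>
#include <string>

static std::string kv_for_arch(const char * tmpl, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), tmpl, arch);
    return std::string(buf);
}

// kv_for_arch("%s.expert_shared_feed_forward_length", "qwen2moe")
//   == "qwen2moe.expert_shared_feed_forward_length"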
@@ -1278,6 +1280,126 @@ struct no_init {
 };

 struct llama_file {
+
+#if defined(_WIN32)
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    HANDLE fp_win32;
+    size_t size;
+
+private:
+    std::string GetErrorMessageWin32(DWORD error_code) const {
+        std::string ret;
+        LPSTR lpMsgBuf = NULL;
+        DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                      NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
+        if (!bufLen) {
+            ret = format("Win32 error code: %s", error_code);
+        } else {
+            ret = lpMsgBuf;
+            LocalFree(lpMsgBuf);
+        }
+
+        return ret;
+    }
+
+public:
+
+    llama_file(const char * fname, const char * mode) {
+        fp = ggml_fopen(fname, mode);
+        if (fp == NULL) {
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
+        }
+        fp_win32 = (HANDLE) _get_osfhandle(_fileno(fp));
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    size_t tell() const {
+        // SetFilePointerEx returns the current position when seeking relative 0 bytes
+        LARGE_INTEGER li;
+        li.QuadPart = 0;
+        BOOL ret = SetFilePointerEx(fp_win32, li, &li, FILE_CURRENT);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+
+        return li.QuadPart;
+    }
+
+    void seek(size_t offset, int whence) const {
+        // no need to convert SEEK_* to FILE_*. The enums are the same.
+        // Still, keep static asserts to avoid failures in the future.
+        static_assert(SEEK_SET == FILE_BEGIN, "SEEK_SET != FILE_BEGIN");
+        static_assert(SEEK_CUR == FILE_CURRENT, "SEEK_CUR != FILE_CURRENT");
+        static_assert(SEEK_END == FILE_END, "SEEK_END != FILE_END");
+
+        LARGE_INTEGER li;
+        li.QuadPart = offset;
+        BOOL ret = SetFilePointerEx(fp_win32, li, NULL, whence);
+        if (!ret) {
+            throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+        }
+    }
+
+    void read_raw(void * ptr, size_t len) const {
+        // On Win32 ReadFile is significant faster than fread which is again significant faster than std::fstream. Thus
+        // use the Win32 API to do file io instead of the C/C++ library functions.
+
+        // There are conditions under which ReadFile cannot read chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_read = 0;
+        while (bytes_read < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_read, 64*1024*1024);
+            DWORD chunk_read = 0;
+            BOOL result = ReadFile(fp_win32, reinterpret_cast<char*>(ptr) + bytes_read, chunk_size, &chunk_read, NULL);
+            if (!result) {
+                throw std::runtime_error(format("read error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_read < chunk_size || chunk_read == 0) {
+                throw std::runtime_error("unexpectedly reached end of file");
+            }
+
+            bytes_read += chunk_read;
+        } ;
+    }
+
+    uint32_t read_u32() const {
+        uint32_t val;
+        read_raw(&val, sizeof(val));
+        return val;
+    }
+
+    void write_raw(const void * ptr, size_t len) const {
+        // There are conditions under which WriteFile cannot write chunks >64MB.
+        // Thus split the operation into smaller chunks if len exceeds this limit.
+        size_t bytes_written = 0;
+        while (bytes_written < len) {
+            size_t chunk_size = std::min<size_t>(len - bytes_written, 64*1024*1024);
+            DWORD chunk_written = 0;
+            BOOL result = WriteFile(fp_win32, reinterpret_cast<char const*>(ptr) + bytes_written, chunk_size, &chunk_written, NULL);
+            if (!result) {
+                throw std::runtime_error(format("write error: %s", GetErrorMessageWin32(GetLastError()).c_str()));
+            }
+            if (chunk_written < chunk_size || chunk_written == 0) {
+                throw std::runtime_error("unexpectedly failed to write bytes");
+            }
+
+            bytes_written += chunk_written;
+        }
+    }
+
+    void write_u32(std::uint32_t val) const {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+#else
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;
@@ -1298,7 +1420,10 @@ struct llama_file {
 #else
         long ret = std::ftell(fp);
 #endif
-
+        if (ret == -1) {
+            throw std::runtime_error(format("ftell error: %s", strerror(errno)));
+        }
+
         return (size_t) ret;
     }

@@ -1308,7 +1433,9 @@ struct llama_file {
 #else
         int ret = std::fseek(fp, (long) offset, whence);
 #endif
-
+        if (ret != 0) {
+            throw std::runtime_error(format("seek error: %s", strerror(errno)));
+        }
     }

     void read_raw(void * ptr, size_t len) const {
@@ -1351,6 +1478,7 @@ struct llama_file {
             std::fclose(fp);
         }
     }
+#endif
 };
 using llama_files = std::vector<std::unique_ptr<llama_file>>;

@@ -1844,6 +1972,7 @@ struct llama_hparams {
     uint32_t n_lora_q = 0;
     uint32_t n_lora_kv = 0;
     uint32_t n_ff_exp = 0;
+    uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float    expert_weights_scale = 0.0;

@@ -1892,6 +2021,7 @@ struct llama_hparams {
         if (this->n_lora_q != other.n_lora_q) return true;
         if (this->n_lora_kv != other.n_lora_kv) return true;
         if (this->n_ff_exp != other.n_ff_exp) return true;
+        if (this->n_ff_shexp != other.n_ff_shexp) return true;
         if (this->n_expert_shared != other.n_expert_shared) return true;

         if (this->rope_finetuned != other.rope_finetuned) return true;
@@ -2163,6 +2293,8 @@ struct llama_vocab {
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

+    int max_token_len = 0; // used for optimizing longest token search
+
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;

@@ -2180,16 +2312,17 @@ struct llama_vocab {
     id special_cls_id  = -1;
     id special_mask_id = -1;

-    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
-    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
-
     id linefeed_id       = 13;
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token

-
+    // tokenizer flags
+    bool tokenizer_add_space_prefix = true;
+    bool tokenizer_add_bos          = false;
+    bool tokenizer_add_eos          = false;
+    bool tokenizer_ignore_merges    = false;

     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
         GGML_ASSERT(token_left.find(' ') == std::string::npos);
@@ -3721,6 +3854,44 @@ struct llama_model_loader {
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

+#if defined(GGML_USE_CUDA)
+        // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
+        // NVMe raid configurations might require more / larger buffers.
+        constexpr size_t num_buffers = 4;
+        constexpr size_t buffer_size = 1 * 1024 * 1024; // 1MB
+
+        std::vector<ggml_backend_buffer_t> host_buffers;
+        std::vector<void*> host_ptrs;
+        std::vector<ggml_backend_event_t> events;
+        size_t buffer_idx = 0; // buffer to use for async loads
+
+        ggml_backend_t cuda_backend = nullptr;
+        if (!use_mmap && !check_tensors) {
+            // When not using mmaped io use async uploads from pinned memory to GPU memory.
+            // First determine if the CUDA backend is active, and if so, determine the device ID.
+            ggml_backend_buffer_t buf = bufs_mmap.count(0) ? bufs_mmap.at(0) : nullptr;
+            if (buf) {
+                ggml_backend_buffer_type_t buffer_type = ggml_backend_buffer_get_type(buf);
+                for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
+                    auto * cuda_buffer_type = ggml_backend_cuda_buffer_type(i);
+                    if (buffer_type == cuda_buffer_type) {
+                        cuda_backend = ggml_backend_cuda_init(i);
+                        break;
+                    }
+                }
+            }
+
+            // If the cuda backend is active create pinned memory buffers and events for synchronisation.
+            if (cuda_backend) {
+                for (size_t idx = 0; idx < num_buffers; ++idx) {
+                    host_buffers.emplace_back(ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buffer_size));
+                    host_ptrs.emplace_back(ggml_backend_buffer_get_base(host_buffers[idx]));
+                    events.emplace_back(ggml_backend_event_new(cuda_backend));
+                }
+            }
+        }
+#endif
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3776,12 +3947,36 @@ struct llama_model_loader {
                     }));
                 }
             } else {
-                read_buf.resize(n_size);
-                file->seek(weight->offs, SEEK_SET);
-                file->read_raw(read_buf.data(), n_size);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
-                if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
-                    throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+#if defined(GGML_USE_CUDA)
+                // If cuda_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
+                if (cuda_backend) {
+                    file->seek(weight->offs, SEEK_SET);
+
+                    size_t bytes_read = 0;
+
+                    while (bytes_read < n_size) {
+                        size_t read_iteration = std::min<size_t>(buffer_size, n_size - bytes_read);
+
+                        ggml_backend_event_synchronize(events[buffer_idx]);
+                        file->read_raw(host_ptrs[buffer_idx], read_iteration);
+                        ggml_backend_tensor_set_async(cuda_backend, cur, host_ptrs[buffer_idx], bytes_read, read_iteration);
+                        ggml_backend_event_record(events[buffer_idx]);
+
+                        bytes_read += read_iteration;
+                        ++buffer_idx;
+                        buffer_idx %= num_buffers;
+                    }
+                }
+                else
+#endif
+                {
+                    read_buf.resize(n_size);
+                    file->seek(weight->offs, SEEK_SET);
+                    file->read_raw(read_buf.data(), n_size);
+                    ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
         }
@@ -3789,6 +3984,18 @@ struct llama_model_loader {
             size_done += n_size;
         }

+#if defined(GGML_USE_CUDA)
+        // free temporary resources used for async cuda uploads
+        if (cuda_backend) {
+            for (size_t idx = 0; idx < num_buffers;++idx) {
+                ggml_backend_event_synchronize(events[idx]);
+                ggml_backend_event_free(events[idx]);
+                ggml_backend_buffer_free(host_buffers[idx]);
+            }
+            ggml_backend_free(cuda_backend);
+        }
+#endif
+
         // check validation results
         bool validation_failed = false;
         for (auto & future : validation_result) {
@@ -4255,6 +4462,9 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_QWEN2MOE:
             {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_A2_7B; break;
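The two ml.get_key calls above read optional GGUF metadata for Qwen2-MoE models (the false argument makes the keys non-required). A hedged sketch of reading the same key through ggml's raw gguf API instead of llama_model_loader; the function name and fallback value are illustrative only:

// Hedged sketch, not llama.cpp code: direct gguf lookup of the new key.
#include "ggml.h"

static uint32_t read_n_ff_shexp(struct gguf_context * ctx) {
    const int kid = gguf_find_key(ctx, "qwen2moe.expert_shared_feed_forward_length");
    if (kid < 0) {
        return 0; // optional key: caller falls back to a default, as llm_load_tensors does
    }
    return gguf_get_val_u32(ctx, kid);
}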
@@ -4563,7 +4773,7 @@ static void llm_load_vocab(

         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         } // The default value of add_space_prefix is true.
     } else if (tokenizer_model == "bert") {
         vocab.type = LLAMA_VOCAB_TYPE_WPM;
@@ -4576,13 +4786,13 @@ static void llm_load_vocab(
         vocab.special_pad_id  = 0;
         vocab.special_cls_id  = 101;
         vocab.special_mask_id = 103;
-        vocab.add_space_prefix = false;
+        vocab.tokenizer_add_space_prefix = false;
     } else if (tokenizer_model == "gpt2") {
         vocab.type = LLAMA_VOCAB_TYPE_BPE;

         const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
         if (add_space_prefix_keyidx != -1) {
-            vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
+            vocab.tokenizer_add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
         }

         // read bpe merges and populate bpe ranks
@@ -4640,6 +4850,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                vocab.tokenizer_ignore_merges = true;
+                vocab.tokenizer_add_bos = true;
             } else if (
                 tokenizer_pre == "deepseek-llm") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
@@ -4690,6 +4902,14 @@ static void llm_load_vocab(
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_bos = true;
+            vocab.tokenizer_add_eos = false;
+        } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            vocab.tokenizer_add_bos = true;
+            vocab.tokenizer_add_eos = false;
         } else {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
@@ -4721,6 +4941,7 @@ static void llm_load_vocab(
             GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

             vocab.token_to_id[word] = i;
+            vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

             auto & token_data = vocab.id_to_token[i];
             token_data.text = std::move(word);
@@ -4834,10 +5055,10 @@ static void llm_load_vocab(
         bool temp = true;

         if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
-            vocab.special_add_bos = int(temp);
+            vocab.tokenizer_add_bos = temp;
         }
         if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
-            vocab.special_add_eos = int(temp);
+            vocab.tokenizer_add_eos = temp;
         }
     }

@@ -4937,7 +5158,7 @@ static void llm_load_vocab(
     );

     // set attributes by model/tokenizer name
-    if (_contains_any(tokenizer_pre, {"jina-v2-es", "jina-v2-de"})) {
+    if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
         _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
     } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
         for (auto id : vocab.cache_special_tokens) {
@@ -5031,6 +5252,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
     if (vocab.special_eot_id    != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }

+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -5040,6 +5263,11 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
+
+    if (model.arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp   = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
 }

 // Returns false if cancelled by progress_callback
@@ -5183,7 +5411,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd       = hparams.n_embd;
-        const int64_t n_embd_head  = n_embd / hparams.n_head;
+        const int64_t n_embd_head  = (hparams.n_head == 0) ? 0 : n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa   = n_embd_v_gqa;
@@ -5826,16 +6054,17 @@ static bool llm_load_tensors(
                         GGML_ASSERT(hparams.n_expert_used > 0);

                         // MoE branch
-                        auto n_ff_exp = n_ff / hparams.n_expert_used;
+                        auto n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / hparams.n_expert_used;
                         layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert});
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

                         // Shared expert branch
+                        auto n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;
                         layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
-                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
-                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff, n_embd});
-                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp});
+                        layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd});
+                        layer.ffn_up_shexp   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp});
                     }
                 } break;
             case LLM_ARCH_PHI2:
@@ -6625,16 +6854,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 #endif

-#ifdef GGML_USE_SYCL
-    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_sycl_set_single_device_mode(params.main_gpu);
-        //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
-        params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
-    } else {
-        ggml_backend_sycl_set_mul_device_mode();
-    }
-#endif
-
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
@@ -7435,6 +7654,50 @@ struct llm_build_context {
         return lctx.inp_s_seq;
     }

+    struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
+        // find result_norm tensor for input
+        struct ggml_tensor * inp = nullptr;
+        for (int i = gf->n_nodes - 1; i >= 0; --i) {
+            inp = gf->nodes[i];
+            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+                break;
+            } else {
+                inp = nullptr;
+            }
+        }
+        GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
+
+        struct ggml_tensor * cur;
+
+        switch (pooling_type) {
+            case LLAMA_POOLING_TYPE_MEAN:
+                {
+                    struct ggml_tensor * inp_mean = build_inp_mean();
+                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
+                } break;
+            case LLAMA_POOLING_TYPE_CLS:
+            case LLAMA_POOLING_TYPE_LAST:
+                {
+                    struct ggml_tensor * inp_cls = build_inp_cls();
+                    cur = ggml_get_rows(ctx0, inp, inp_cls);
+                } break;
+            case LLAMA_POOLING_TYPE_NONE:
+                {
+                    cur = inp;
+                } break;
+            default:
+                {
+                    GGML_ASSERT(false && "unknown pooling type");
+                } break;
+        }
+
+        cb(cur, "result_embd_pooled", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

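append_pooling reduces the per-token result_norm/result_embd tensor to one vector per sequence; for LLAMA_POOLING_TYPE_MEAN the ggml_mul_mat against inp_mean is just an average over the tokens of each sequence. A plain-C++ sketch of that reduction with simplified shapes and no ggml types, for illustration only:

// Hedged sketch (not ggml code): what MEAN pooling computes.
// pooled[s] = average of the embeddings of all tokens belonging to sequence s.
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> mean_pool(
        const std::vector<std::vector<float>> & embd, // n_tokens x n_embd token embeddings
        const std::vector<int>                & seq,  // sequence id of each token
        int n_seq) {
    const int n_embd = embd.empty() ? 0 : (int) embd[0].size();
    std::vector<std::vector<float>> out(n_seq, std::vector<float>(n_embd, 0.0f));
    std::vector<int> count(n_seq, 0);
    for (size_t i = 0; i < embd.size(); ++i) {
        for (int d = 0; d < n_embd; ++d) {
            out[seq[i]][d] += embd[i][d];
        }
        count[seq[i]] += 1;
    }
    for (int s = 0; s < n_seq; ++s) {
        for (int d = 0; d < n_embd; ++d) {
            if (count[s] > 0) {
                out[s][d] /= count[s];
            }
        }
    }
    return out;
}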
@@ -8415,8 +8678,6 @@ struct llm_build_context {
         if (model.arch != LLM_ARCH_JINA_BERT_V2) {
             inp_pos = build_inp_pos();
         }
-        struct ggml_tensor * inp_mean = build_inp_mean();
-        struct ggml_tensor * inp_cls  = build_inp_cls();

         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
@@ -8591,28 +8852,6 @@ struct llm_build_context {
         cur = inpL;
         cb(cur, "result_embd", -1);

-        // pooling layer
-        switch (pooling_type) {
-            case LLAMA_POOLING_TYPE_NONE:
-                {
-                    // nop
-                } break;
-            case LLAMA_POOLING_TYPE_MEAN:
-                {
-                    cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_CLS:
-                {
-                    cur = ggml_get_rows(ctx0, cur, inp_cls);
-                    cb(cur, "result_embd_pooled", -1);
-                } break;
-            case LLAMA_POOLING_TYPE_UNSPECIFIED:
-                {
-                    GGML_ASSERT(false && "Invalid pooling type");
-                } break;
-        }
-
         ggml_build_forward_expand(gf, cur);

         return gf;
@@ -11697,6 +11936,11 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }

+    // add on pooling layer
+    if (lctx.cparams.embeddings) {
+        result = llm.append_pooling(result);
+    }
+
     llm.free();

     return result;
@@ -11786,7 +12030,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         // (!a || b) is a logical implication (a -> b)
         // !hparams.causal_attn -> !cparams.causal_attn
         (hparams.causal_attn || !cparams.causal_attn) &&
-        "causal attention
+        "causal attention is not supported by this model"
     );

     if (lctx.inp_KQ_mask) {
@@ -11918,6 +12162,37 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(lctx.inp_cls);
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));
+
+        std::vector<int> last_pos(n_tokens, -1);
+        std::vector<int> last_row(n_tokens, -1);
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos   pos     = batch.pos[i];
+
+            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");
+
+            if (pos >= last_pos[seq_id]) {
+                last_pos[seq_id] = pos;
+                last_row[seq_id] = i;
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            if (last_row[i] >= 0) {
+                data[i] = last_row[i];
+            }
+        }
+    }
+
     if (kv_self.recurrent) {
         const int64_t n_kv = kv_self.n;

@@ -11979,8 +12254,8 @@ static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) {
     const auto n_embd  = hparams.n_embd;

     // TODO: use a per-batch flag for logits presence instead
-    const bool has_logits = cparams.
-    const bool has_embd   =
+    const bool has_logits = !cparams.embeddings;
+    const bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

     const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
     const size_t embd_size   = has_embd   ?  n_embd*n_outputs_max : 0;
@@ -12110,11 +12385,13 @@ static int llama_decode_internal(
     std::vector<std::vector<llama_seq_id>> seq_id;

     // count outputs
-    if (batch_all.logits) {
+    if (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE) {
+        n_outputs = n_tokens_all;
+    } else if (batch_all.logits) {
         for (uint32_t i = 0; i < n_tokens_all; ++i) {
             n_outputs += batch_all.logits[i] != 0;
         }
-    } else if (lctx.logits_all
+    } else if (lctx.logits_all) {
         n_outputs = n_tokens_all;
     } else {
         // keep last output only
@@ -12245,30 +12522,13 @@ static int llama_decode_internal(
             // no output
             res  = nullptr;
             embd = nullptr;
-        } else if (!hparams.causal_attn) {
-            res = nullptr; // do not extract logits for embedding models such as BERT
-
-            // token or sequence embeddings
-            embd = gf->nodes[gf->n_nodes - 1];
-
-            GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0);
         } else if (cparams.embeddings) {
-
-
-
-
-                if (i_embd < 0) { break; }
-                embd = gf->nodes[i_embd];
-            }
-
-            GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor");
-
-            // TODO: use a per-batch flag to know when to skip logits while keeping embeddings
-            if (!cparams.causal_attn) {
-                res = nullptr; // do not extract logits when not needed
-                // skip computing logits
-                // TODO: is this safe?
-                gf->n_nodes = i_embd + 1;
+            res = nullptr; // do not extract logits for embedding case
+            embd = gf->nodes[gf->n_nodes - 1];
+            if (strcmp(embd->name, "result_embd_pooled") != 0) {
+                embd = gf->nodes[gf->n_nodes - 2];
             }
+            GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
         } else {
             embd = nullptr; // do not extract embeddings when not needed
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
@@ -12337,11 +12597,10 @@ static int llama_decode_internal(
                     ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                 }
             } break;
-        case LLAMA_POOLING_TYPE_CLS:
         case LLAMA_POOLING_TYPE_MEAN:
+        case LLAMA_POOLING_TYPE_CLS:
+        case LLAMA_POOLING_TYPE_LAST:
             {
-                GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0);
-
                 // extract sequence embeddings
                 auto & embd_seq_out = lctx.embd_seq;
                 embd_seq_out.clear();
@@ -12955,112 +13214,142 @@ struct llm_bigram_bpe {
 };

 struct llm_tokenizer_bpe {
-    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                });
-                break;
-            case LLAMA_VOCAB_PRE_TYPE_PORO:
-                word_collection = unicode_regex_split(text, {
-                    " ?[^(\\s|.,!?…。,、।۔،)]+",
-                });
-                break;
-            default:
-                // default regex for BPE tokenization pre-processing
-                word_collection = unicode_regex_split(text, {
-                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
-                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
-                    "\\p{N}+",
-                    "[0-9][0-9][0-9]",
-                });
-                break;
-        }
+    llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {
+        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.type_pre) {
+            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

+                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                regex_exprs = {
+                    // same as llama3
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
+                    "\\s+$",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                regex_exprs = {
+                    "[\r\n]",
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "[一-龥ࠀ-一가-]+",
+                    "\\p{N}",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "[0-9][0-9][0-9]",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_MPT:
+                // TODO: MPT pre-tokenization regexes are unknown
+                //       the following are close, but not exact. run the following:
+                //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                regex_exprs = {
+                    "\\s?\\p{L}+",
+                    "\\s?\\p{P}+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
+            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
+                regex_exprs = {
+                    "\\p{N}",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_GPT2:
+            case LLAMA_VOCAB_PRE_TYPE_OLMO:
+                regex_exprs = {
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
+            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
+            case LLAMA_VOCAB_PRE_TYPE_PORO:
+                regex_exprs = {
+                    " ?[^(\\s|.,!?…。,、।۔،)]+",
+                };
                 break;
             default:
-
+                // default regex for BPE tokenization pre-processing
+                regex_exprs = {
+                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                    "\\p{N}+",
+                    "[0-9][0-9][0-9]",
+                };
                 break;
         }
+    }
+
+    void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) const {
+        output.push_back(token_id);
+    }
+
+    bool append_bos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos) {
+            GGML_ASSERT(vocab.special_bos_id != -1);
+            output.push_back(vocab.special_bos_id);
+            return true;
+        }
+        return false;
+    }
+
+    bool append_eos(std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_eos) {
+            GGML_ASSERT(vocab.special_eos_id != -1);
+            output.push_back(vocab.special_eos_id);
+            return true;
+        }
+        return false;
+    }
+
+    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
+        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+            LLAMA_LOG_WARN(
+                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
+                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
+                "Are you sure this is what you want?\n", __FUNCTION__);
+        }
+    }
+
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        int final_prev_index = -1;
+
+        const auto word_collection = unicode_regex_split(text, regex_exprs);

         symbols_final.clear();

@@ -13071,7 +13360,7 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

-            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                 offset = word.size();
             }
@@ -13152,10 +13441,9 @@ struct llm_tokenizer_bpe {
                 for (auto j = str.begin(); j != str.end(); ++j) {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte == vocab.token_to_id.end()) {
-
+                    if (token_multibyte != vocab.token_to_id.end()) {
+                        output.push_back(token_multibyte->second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -13194,6 +13482,8 @@ private:

     const llama_vocab & vocab;

+    std::vector<std::string> regex_exprs;
+
     std::vector<llm_symbol> symbols;
     std::vector<llm_symbol> symbols_final;

@@ -13203,7 +13493,7 @@ private:
 struct llm_tokenizer_wpm {
     llm_tokenizer_wpm(const llama_vocab & vocab): vocab(vocab) {}

-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) const {
         const auto & token_map = vocab.token_to_id;

         // normalize and split by whitespace
@@ -13212,7 +13502,7 @@ struct llm_tokenizer_wpm {
         // bos token prepended already

         // find the longest tokens that form the words
-        for (const std::string &word : words) {
+        for (const std::string & word : words) {
             // skip empty words
             if (word.size() == 0) {
                 continue;
@@ -13229,7 +13519,7 @@ struct llm_tokenizer_wpm {
             for (int i = 0; i < n; ++i) {
                 // loop through possible match length
                 bool match = false;
-                for (int j = n; j > i; j--) {
+                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
                     auto it = token_map.find(word1.substr(i, j - i));
                     if (it != token_map.end()) {
                         output.push_back(it->second);
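The change above bounds the WPM greedy longest-match search by vocab.max_token_len instead of always scanning to the end of the word. A simplified, hedged sketch of the same idea outside llama.cpp; the types and the unknown-piece fallback differ from the real llm_tokenizer_wpm:

// Hedged sketch, not llama.cpp code: greedy longest-match lookup capped by the
// length of the longest known token.
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

std::vector<int> wpm_greedy(const std::string & word,
                            const std::unordered_map<std::string, int> & token_map,
                            int max_token_len) {
    std::vector<int> out;
    const int n = (int) word.size();
    int i = 0;
    while (i < n) {
        int matched = -1;
        // try the longest candidate first, but never longer than max_token_len
        for (int j = std::min(n, i + max_token_len); j > i; --j) {
            auto it = token_map.find(word.substr(i, j - i));
            if (it != token_map.end()) {
                out.push_back(it->second);
                matched = j;
                break;
            }
        }
        if (matched < 0) {
            break; // unknown piece; the real tokenizer handles this with an [UNK] fallback
        }
        i = matched;
    }
    return out;
}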
@@ -13252,11 +13542,12 @@ struct llm_tokenizer_wpm {
                }
            }
        }

-
+    // TODO: reduce string copies by using cpts_offs array
+    std::vector<std::string> preprocess(const std::string & text) const {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

-        for (const
+        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags(cpt);

            if (flags.is_whitespace) {
@@ -13474,7 +13765,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &

                bool is_prev_special = false;

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos) {
                    GGML_ASSERT(vocab.special_bos_id != -1);
                    output.push_back(vocab.special_bos_id);
                    is_prev_special = true;
@@ -13484,7 +13775,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                    auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);

-                    if (vocab.add_space_prefix) {
+                    if (vocab.tokenizer_add_space_prefix) {
                        if (!output.size() || is_prev_special) { // prefix with space if first token
                            raw_text = " " + raw_text;
                        }
@@ -13502,23 +13793,24 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    }
                }

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                    LLAMA_LOG_WARN(
                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                        "Are you sure this is what you want?\n", __FUNCTION__);
                }

-                if (add_special && vocab.
+                if (add_special && vocab.tokenizer_add_eos) {
                    GGML_ASSERT(vocab.special_eos_id != -1);
                    output.push_back(vocab.special_eos_id);
                }
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-
-
-
+                llm_tokenizer_bpe tokenizer(vocab);
+
+                if (add_special) {
+                    tokenizer.append_bos(output);
                }

                for (const auto & fragment : fragment_buffer) {
@@ -13528,23 +13820,15 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                    LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                    llm_tokenizer_bpe tokenizer(vocab);
                    tokenizer.tokenize(raw_text, output);
                } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
-                    output.push_back(fragment.token);
+                    tokenizer.append(fragment.token, output);
                }
            }

-                if (add_special
-                    LLAMA_LOG_WARN(
-                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
-                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
-                        "Are you sure this is what you want?\n", __FUNCTION__);
-                }
-
-                if (add_special && vocab.special_add_eos == 1) {
-                    GGML_ASSERT(vocab.special_add_eos != -1);
-                    output.push_back(vocab.special_eos_id);
+                if (add_special) {
+                    tokenizer.append_eos(output);
+                    tokenizer.check_double_bos_eos(output);
                }
            } break;
        case LLAMA_VOCAB_TYPE_WPM:
@@ -13554,6 +13838,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                    output.push_back(vocab.special_cls_id);
                }

+                llm_tokenizer_wpm tokenizer(vocab);
+
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
@@ -13561,7 +13847,6 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
 #ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
 #endif
-                        llm_tokenizer_wpm tokenizer(vocab);
                        tokenizer.tokenize(raw_text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
@@ -16070,6 +16355,11 @@ struct llama_context * llama_new_context_with_model(
        params.flash_attn = false;
    }

+    if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
+        LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
    if (params.type_v != GGML_TYPE_F16 && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
@@ -16241,8 +16531,7 @@ struct llama_context * llama_new_context_with_model(
        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
            if (backend == nullptr) {
-                int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
                llama_free(ctx);
                return nullptr;
            }
@@ -17870,6 +18159,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)
    ctx->abort_callback_data = abort_callback_data;
 }

+void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
+    ctx->cparams.embeddings = embeddings;
+}
+
 void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
 }
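llama_set_embeddings() lets callers flip an existing context between producing logits and producing pooled embeddings without recreating it. A hedged usage sketch; context and batch setup are elided, and it assumes the context was created with a pooling type other than NONE:

// Hedged usage sketch of the new llama_set_embeddings() toggle.
#include "llama.h"

void embed_then_generate(llama_context * ctx, llama_batch batch) {
    llama_set_embeddings(ctx, true);   // graphs now end in a pooled embedding tensor
    llama_decode(ctx, batch);
    const float * embd = llama_get_embeddings_seq(ctx, 0);
    (void) embd;                       // use the sequence embedding here

    llama_set_embeddings(ctx, false);  // switch back to producing logits
}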
@@ -18113,11 +18406,11 @@ llama_token llama_token_nl(const struct llama_model * model) {
 }

 int32_t llama_add_bos_token(const struct llama_model * model) {
-    return model->vocab.special_add_bos;
+    return model->vocab.tokenizer_add_bos;
 }

 int32_t llama_add_eos_token(const struct llama_model * model) {
-    return model->vocab.special_add_eos;
+    return model->vocab.tokenizer_add_eos;
 }

 llama_token llama_token_prefix(const struct llama_model * model) {