@fugood/llama.node 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +14 -14
- package/src/llama.cpp/common/CMakeLists.txt +4 -5
- package/src/llama.cpp/common/arg.cpp +37 -0
- package/src/llama.cpp/common/common.cpp +22 -6
- package/src/llama.cpp/common/common.h +14 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
- package/src/llama.cpp/ggml/include/ggml.h +13 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
- package/src/llama.cpp/include/llama.h +13 -48
- package/src/llama.cpp/src/llama-arch.cpp +222 -15
- package/src/llama.cpp/src/llama-arch.h +16 -1
- package/src/llama.cpp/src/llama-batch.cpp +76 -70
- package/src/llama.cpp/src/llama-batch.h +24 -18
- package/src/llama.cpp/src/llama-chat.cpp +44 -1
- package/src/llama.cpp/src/llama-chat.h +2 -0
- package/src/llama.cpp/src/llama-context.cpp +134 -95
- package/src/llama.cpp/src/llama-context.h +13 -16
- package/src/llama.cpp/src/llama-cparams.h +3 -2
- package/src/llama.cpp/src/llama-graph.cpp +239 -154
- package/src/llama.cpp/src/llama-graph.h +162 -126
- package/src/llama.cpp/src/llama-hparams.cpp +45 -0
- package/src/llama.cpp/src/llama-hparams.h +11 -1
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
- package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
- package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
- package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
- package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
- package/src/llama.cpp/src/llama-model.cpp +2309 -665
- package/src/llama.cpp/src/llama-model.h +18 -4
- package/src/llama.cpp/src/llama-quant.cpp +2 -2
- package/src/llama.cpp/src/llama-vocab.cpp +368 -9
- package/src/llama.cpp/src/llama-vocab.h +43 -0
- package/src/llama.cpp/src/unicode.cpp +207 -0
- package/src/llama.cpp/src/unicode.h +2 -0
package/src/llama.cpp/src/llama-model.h

@@ -32,17 +32,21 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M,
     LLM_TYPE_270M,
     LLM_TYPE_335M,
+    LLM_TYPE_350M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
+    LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
     LLM_TYPE_1B,
+    LLM_TYPE_1_2B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
@@ -95,8 +99,10 @@ enum llm_type {
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
     LLM_TYPE_A13B,
+    LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
+    LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };
@@ -154,6 +160,12 @@ struct llama_layer_convnext {
     struct ggml_tensor * gamma = nullptr;
 };
 
+struct llama_layer_shortconv {
+    struct ggml_tensor * in_proj = nullptr;
+    struct ggml_tensor * conv = nullptr;
+    struct ggml_tensor * out_proj = nullptr;
+};
+
 struct llama_layer {
     // normalization
     struct ggml_tensor * attn_norm = nullptr;
@@ -174,6 +186,9 @@ struct llama_layer {
     struct ggml_tensor * attn_norm_cross = nullptr;
     struct ggml_tensor * attn_norm_enc = nullptr;
     struct ggml_tensor * ssm_norm = nullptr;
+    struct ggml_tensor * ssm_dt_norm = nullptr;
+    struct ggml_tensor * ssm_b_norm = nullptr;
+    struct ggml_tensor * ssm_c_norm = nullptr;
 
     // attention
     struct ggml_tensor * wq = nullptr;
@@ -337,6 +352,8 @@ struct llama_layer {
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
+
+    struct llama_layer_shortconv shortconv;
 };
 
 struct llama_model {
@@ -437,10 +454,7 @@ struct llama_model {
     llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
-    llm_graph_result_ptr build_graph(
-            const llm_graph_params & params,
-                       ggml_cgraph * gf,
-                    llm_graph_type   type) const;
+    ggml_cgraph * build_graph(const llm_graph_params & params) const;
 
 private:
     struct impl;
package/src/llama.cpp/src/llama-quant.cpp

@@ -844,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize Mamba's small yet 2D weights
         // NOTE: can't use LLM_TN here because the layer number is not known
         quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("shortconv.conv.weight") == std::string::npos;
 
         // do not quantize RWKV's small yet 2D weights
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
@@ -883,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                     if (qtype != new_type) {
                         LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                        new_type = qtype;
-                        break; // if two or more types are specified for the tensor, first match wins
+                        new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
                     }
                 }
             }
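Note on the llama-quant.cpp hunk above: the override loop no longer breaks on the first matching regex, so when several type rules match the same tensor name the last one listed now wins. The standalone sketch below is not part of the diff; the rule list, tensor name, and type names are made up purely to illustrate the difference in resolution order.

```cpp
// Simplified illustration of the new "last match wins" override resolution.
// The rules and tensor name here are hypothetical.
#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // (tensor-name regex, quantization type) override rules, in the order given by the user
    const std::vector<std::pair<std::string, std::string>> rules = {
        { "ffn_down",           "q5_k" }, // broad rule
        { "blk\\.0\\.ffn_down", "q8_0" }, // more specific rule listed later
    };

    const std::string tensor_name = "blk.0.ffn_down.weight";
    std::string new_type = "q4_k"; // default choice before overrides

    for (const auto & [tname, qtype] : rules) {
        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
            new_type = qtype; // keep scanning: the last matching rule wins
            // (the previous behavior would `break` here, so the first match won)
        }
    }

    std::printf("%s -> %s\n", tensor_name.c_str(), new_type.c_str()); // prints: ... -> q8_0
    return 0;
}
```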
package/src/llama.cpp/src/llama-vocab.cpp

@@ -11,6 +11,7 @@
 #include <cassert>
 #include <cctype>
 #include <cfloat>
+#include <cmath>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
@@ -404,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+                regex_exprs = {
+                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+                    // The custom handler implements all K2 patterns with proper Han character exclusion
+                    "\\p{Han}+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                 regex_exprs = {
                     "\\p{N}+",
@@ -1196,6 +1204,284 @@ private:
     const llm_tokenizer_rwkv & tokenizer;
 };
 
+struct llm_tokenizer_plamo2 : llm_tokenizer {
+    llm_tokenizer_plamo2(const llama_vocab & vocab) {
+        build(vocab);
+    }
+
+    void build(const llama_vocab & vocab) {
+        // Reset internal structures
+        tokens_.clear();
+        bytes_.assign(256, 0);
+        to_suffix_id_.clear();
+        table_.clear();
+
+        // Build token list and byte mapping
+        std::unordered_map<std::string, float> suffix_to_score;
+        std::unordered_map<std::string, llama_token> token_to_id;
+
+        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
+            const auto & entry = vocab.get_token_data(token_id);
+            tokens_.push_back(entry.text);
+            token_to_id[entry.text] = static_cast<llama_token>(token_id);
+
+            // Handle byte tokens
+            if (vocab.is_byte(token_id)) {
+                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
+                    std::string hex_str = entry.text.substr(3, 2);
+                    int byte_val = std::stoi(hex_str, nullptr, 16);
+                    bytes_[byte_val] = static_cast<llama_token>(token_id);
+                }
+                continue;
+            }
+
+            // Add token and all its suffixes to suffix_to_score
+            suffix_to_score[entry.text] = entry.score;
+
+            // Extract suffixes character by character (UTF-8 aware)
+            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
+            for (size_t i = 1; i < cpts.size(); ++i) {
+                std::string suffix;
+                for (size_t j = i; j < cpts.size(); ++j) {
+                    suffix += unicode_cpt_to_utf8(cpts[j]);
+                }
+                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
+                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
+                }
+            }
+        }
+
+        // Check that all byte tokens are set
+        for (int i = 0; i < 256; ++i) {
+            if (bytes_[i] == 0) {
+                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
+            }
+        }
+
+        // Build suffix list in lexicographical order of reversed strings
+        std::vector<std::string> suffixes;
+        for (const auto & pair : suffix_to_score) {
+            suffixes.push_back(pair.first);
+        }
+        suffixes.push_back(""); // Empty suffix
+
+        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
+            std::string rev_a(a.rbegin(), a.rend());
+            std::string rev_b(b.rbegin(), b.rend());
+            return rev_a < rev_b;
+        });
+
+        // Build suffix_to_id and to_suffix_id_
+        std::unordered_map<std::string, int32_t> suffix_to_id;
+        int32_t num_pieces = 0;
+
+        for (const auto & suffix : suffixes) {
+            suffix_to_id[suffix] = num_pieces;
+            if (!suffix.empty()) {
+                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+
+                std::string remaining;
+                for (size_t i = 1; i < cpts.size(); ++i) {
+                    remaining += unicode_cpt_to_utf8(cpts[i]);
+                }
+
+                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
+                to_suffix_id_[piece_code] = num_pieces;
+
+                // Count number of pieces for this suffix
+                int32_t pieces_for_suffix = 1; // sentinel row
+                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+                    std::string piece;
+                    for (int32_t i = 0; i < piece_length; ++i) {
+                        piece += unicode_cpt_to_utf8(cpts[i]);
+                    }
+                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
+                        pieces_for_suffix++;
+                    }
+                }
+                num_pieces += pieces_for_suffix;
+            } else {
+                num_pieces++; // Empty suffix contributes one piece (sentinel row)
+            }
+        }
+
+        // Build flattened table
+        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
+        int32_t table_idx = 0;
+
+        for (const auto & suffix : suffixes) {
+            // Add all prefixes of the suffix to the table (in decreasing order of length)
+            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+                std::string piece;
+                for (int32_t i = 0; i < piece_length; ++i) {
+                    piece += unicode_cpt_to_utf8(cpts[i]);
+                }
+
+                auto score_it = suffix_to_score.find(piece);
+                if (score_it == suffix_to_score.end()) {
+                    continue;
+                }
+
+                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
+                auto token_it = token_to_id.find(piece);
+                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
+
+                float score = score_it->second;
+                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
+                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
+                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
+
+                table_idx++;
+            }
+
+            // Add sentinel row
+            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
+            table_[table_idx][TABLE_TOKEN_ID] = -1;
+            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
+            table_idx++;
+        }
+    }
+
+    std::vector<llama_token> encode(const std::string & text) const {
+        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
+        // Skip the first code point if it is a BOM (Byte Order Mark)
+        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
+            unicode_data.erase(unicode_data.begin());
+        }
+
+        if (unicode_data.empty()) {
+            return {};
+        }
+
+        const size_t data_len = unicode_data.size();
+
+        // Initialize scores array (dynamic programming)
+        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
+        scores[data_len] = 0;
+
+        // Path array to track best tokenization
+        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
+
+        int32_t suffix_id = 0;
+
+        // Process from end to beginning
+        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
+            uint32_t c = unicode_data[i];
+
+            // Find next suffix ID
+            for (size_t p = suffix_id; p < table_.size(); ++p) {
+                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
+                auto it = to_suffix_id_.find(piece_code);
+                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
+
+                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
+                    break;
+                }
+            }
+
+            // Update best path
+            for (size_t p = suffix_id; p < table_.size(); ++p) {
+                int32_t score = table_[p][TABLE_SCORE];
+                if (score > INVALID_SCORE) {
+                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
+                    int64_t s = scores[i + piece_length] - score;
+
+                    if (s < scores[i]) {
+                        scores[i] = s;
+                        path[i][PATH_TOKEN_LENGTH] = piece_length;
+                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
+                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
+
+                        if (score == UNKNOWN_SCORE) {
+                            // Add UTF-8 byte count
+                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+                        }
+                    }
+                }
+
+                if (score == UNKNOWN_SCORE) {
+                    break;
+                }
+            }
+        }
+
+        // Decode the best path
+        std::vector<llama_token> token_ids;
+        token_ids.reserve(path[0][PATH_NUM_TOKENS]);
+
+        int pos = 0;
+        while (pos < static_cast<int>(data_len)) {
+            if (path[pos][PATH_TOKEN_ID] >= 0) {
+                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
+            } else {
+                // Fall back to byte tokens
+                uint32_t c = unicode_data[pos];
+                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+
+                for (int i = 0; i < s; ++i) {
+                    uint8_t b;
+                    if (s == 1) {
+                        b = c;
+                    } else {
+                        if (i == 0) {
+                            b = (0xF00 >> s) & 0xFF;
+                        } else {
+                            b = 0x80;
+                        }
+                    }
+                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
+                }
+            }
+
+            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
+            pos += path[pos][PATH_TOKEN_LENGTH];
+        }
+
+        return token_ids;
+    }
+private:
+    // Constants for table structure
+    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
+    static constexpr int32_t TABLE_TOKEN_ID = 1;
+    static constexpr int32_t TABLE_SCORE = 2;
+    static constexpr int32_t TABLE_PIECE_ID = 3;
+
+    // Constants for path array
+    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
+    static constexpr int32_t PATH_TOKEN_ID = 1;
+    static constexpr int32_t PATH_NUM_TOKENS = 2;
+
+    // Score constants
+    static constexpr int32_t INVALID_SCORE = -20000000;
+    static constexpr int32_t UNKNOWN_SCORE = -10000000;
+
+    // List of tokens in the vocabulary
+    std::vector<std::string> tokens_;
+
+    // Mapping from byte code point to token ID (for byte fallback)
+    std::vector<llama_token> bytes_;
+
+    // Mapping from piece code to suffix ID
+    std::unordered_map<int64_t, int32_t> to_suffix_id_;
+
+    // Flattened table representing the Trie structure
+    // Each row contains: [piece_length, token_id, score, piece_id]
+    std::vector<std::vector<int32_t>> table_;
+};
+
+struct llm_tokenizer_plamo2_session {
+    llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        std::vector<llama_token> tokens = tokenizer.encode(text);
+        output.insert(output.end(), tokens.begin(), tokens.end());
+    }
+
+private:
+    const llm_tokenizer_plamo2 & tokenizer;
+};
+
 //
 // impl
 //
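A small aside on the byte-fallback arithmetic in `encode()` above: for a code point that needs `s` UTF-8 bytes, the expression `(0xF00 >> s) & 0xFF` yields the standard lead-byte prefix, the lead byte is that prefix OR'd with the top code-point bits, and each continuation byte is `0x80` OR'd with the next 6 bits. The self-contained check below is not from the diff; it simply prints the prefixes to show why the trick works.

```cpp
// Standalone sanity check of the UTF-8 lead-byte trick used by the PLaMo-2 byte fallback.
#include <cstdint>
#include <cstdio>

int main() {
    for (int s = 2; s <= 4; ++s) {
        const uint8_t lead = (0xF00 >> s) & 0xFF;
        // s=2 -> 0xC0 (110xxxxx), s=3 -> 0xE0 (1110xxxx), s=4 -> 0xF0 (11110xxx)
        std::printf("s=%d lead-byte prefix=0x%02X\n", s, lead);
    }
    return 0;
}
```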
@@ -1499,6 +1785,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_unk_id = LLAMA_TOKEN_NULL;
             special_sep_id = LLAMA_TOKEN_NULL;
             special_pad_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "plamo2") {
+            type = LLAMA_VOCAB_TYPE_PLAMO2;
+
+            // PLaMo-2 default special tokens (these will be overridden by model config)
+            special_bos_id = 1; // <|plamo:bos|>
+            special_eos_id = 2; // <|plamo:eos|>
+            special_unk_id = 0; // <|plamo:unk|>
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = 3; // <|plamo:pad|>
+            special_mask_id = LLAMA_TOKEN_NULL;
         } else {
             throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
@@ -1524,7 +1820,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama-bpe"||
                 tokenizer_pre == "falcon3" ||
                 tokenizer_pre == "falcon-h1" ||
-                tokenizer_pre == "pixtral") {
+                tokenizer_pre == "pixtral" ||
+                tokenizer_pre == "midm-2.0" ||
+                tokenizer_pre == "lfm2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
@@ -1627,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         } else if (
                 tokenizer_pre == "exaone") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+        } else if (
+                tokenizer_pre == "exaone4") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "chameleon") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -1663,6 +1964,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "hunyuan") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "kimi-k2") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -1846,6 +2151,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<EOT>"
                         || t.first == "_<EOT>"
                         || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                        || t.first == "<end_of_utterance>" // smoldocling
                         ) {
                     special_eot_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2005,6 +2311,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<EOT>"
                         || t.first == "_<EOT>"
                         || t.first == "<|end_of_text|>"
+                        || t.first == "<end_of_utterance>" // smoldocling
                         ) {
                     special_eog_ids.insert(t.second);
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2141,13 +2448,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {
 
 std::string llama_vocab::impl::type_name() const{
     switch (type) {
-        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
-        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
-        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-        default:                    return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE:   return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:    return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:    return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:    return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM:    return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV:   return "RWKV";
+        case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
+        default:                      return "unknown";
     }
 }
 
@@ -2230,6 +2538,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
         case LLAMA_VOCAB_TYPE_RWKV:
             tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
            break;
+        case LLAMA_VOCAB_TYPE_PLAMO2:
+            tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
+            break;
         default:
             GGML_ABORT("unsupported vocab type");
     }
@@ -2562,6 +2873,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_PLAMO2:
+            {
+                llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
@@ -2660,6 +2988,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                 memcpy(buf, result.data(), result.size());
                 return (int)result.size();
             }
+            case LLAMA_VOCAB_TYPE_PLAMO2: {
+                // PLaMo-2 uses similar token handling as BPE/SPM
+                if (vocab.is_byte(token)) {
+                    // Handle byte tokens like <0xXX>
+                    if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
+                        int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
+                        if (length < 1) {
+                            return -1;
+                        }
+                        buf[0] = static_cast<char>(hex_val);
+                        return 1;
+                    }
+                }
+
+                // Normal token - just copy the text
+                std::string result = token_text;
+                return _try_copy(result.data(), result.size());
+            }
             default:
                 GGML_ABORT("fatal error");
         }
@@ -2904,6 +3250,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
         case LLAMA_VOCAB_TYPE_BPE: {
             return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
         }
+        case LLAMA_VOCAB_TYPE_PLAMO2: {
+            // PLaMo-2 uses byte tokens in format <0xXX>
+            char hex_str[8];
+            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
+            return pimpl->token_to_id.at(hex_str);
+        }
         default:
             GGML_ABORT("fatal error");
     }
@@ -3005,6 +3357,10 @@ llama_token llama_vocab::token_fim_sep() const {
     return pimpl->special_fim_sep_id;
 }
 
+llama_token llama_vocab::token_mask() const {
+    return pimpl->special_mask_id;
+}
+
 bool llama_vocab::get_add_space_prefix() const {
     return pimpl->add_space_prefix;
 }
@@ -3245,6 +3601,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
     return vocab->token_fim_sep();
 }
 
+llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
+    return vocab->token_mask();
+}
+
 // deprecated
 const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
     return llama_vocab_get_text(vocab, token);
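The two hunks above expose the vocabulary's mask token through `llama_vocab::token_mask()` and the public C accessor `llama_vocab_mask()`. A minimal usage sketch follows; it is not part of the diff, and the loader calls it leans on (`llama_model_default_params`, `llama_model_load_from_file`, `llama_model_get_vocab`, `llama_model_free`) are assumed from the surrounding llama.h API, with a GGUF model path supplied by the caller.

```cpp
// Hedged usage sketch: query the mask token id of a loaded model's vocabulary.
#include "llama.h"
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // Returns the vocab's mask token id, or LLAMA_TOKEN_NULL (-1) when the
    // tokenizer defines no mask token (as in the PLaMo-2 defaults above).
    const llama_token mask_id = llama_vocab_mask(vocab);
    std::printf("mask token id: %d\n", mask_id);

    llama_model_free(model);
    return 0;
}
```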
@@ -3381,4 +3741,3 @@ int32_t llama_detokenize(
         bool unparse_special) {
     return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
-
package/src/llama.cpp/src/llama-vocab.h

@@ -6,6 +6,48 @@
 #include <vector>
 #include <memory>
 
+// pre-tokenization types
+enum llama_vocab_pre_type {
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+};
+
 struct LLM_KV;
 struct llama_model_loader;
 
@@ -59,6 +101,7 @@ struct llama_vocab {
     llama_token token_sep() const;
     llama_token token_nl () const;
     llama_token token_pad() const;
+    llama_token token_mask() const;
 
     llama_token token_prefix() const;
     llama_token token_middle() const;