@fugood/llama.node 1.0.3 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  3. package/src/llama.cpp/common/arg.cpp +37 -0
  4. package/src/llama.cpp/common/common.cpp +22 -6
  5. package/src/llama.cpp/common/common.h +14 -1
  6. package/src/llama.cpp/ggml/CMakeLists.txt +3 -0
  7. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  8. package/src/llama.cpp/ggml/include/ggml.h +13 -0
  9. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -0
  10. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  11. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +23 -8
  12. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +3 -0
  13. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +39 -0
  14. package/src/llama.cpp/include/llama.h +13 -48
  15. package/src/llama.cpp/src/llama-arch.cpp +222 -15
  16. package/src/llama.cpp/src/llama-arch.h +16 -1
  17. package/src/llama.cpp/src/llama-batch.cpp +76 -70
  18. package/src/llama.cpp/src/llama-batch.h +24 -18
  19. package/src/llama.cpp/src/llama-chat.cpp +44 -1
  20. package/src/llama.cpp/src/llama-chat.h +2 -0
  21. package/src/llama.cpp/src/llama-context.cpp +134 -95
  22. package/src/llama.cpp/src/llama-context.h +13 -16
  23. package/src/llama.cpp/src/llama-cparams.h +3 -2
  24. package/src/llama.cpp/src/llama-graph.cpp +239 -154
  25. package/src/llama.cpp/src/llama-graph.h +162 -126
  26. package/src/llama.cpp/src/llama-hparams.cpp +45 -0
  27. package/src/llama.cpp/src/llama-hparams.h +11 -1
  28. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +11 -5
  29. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +3 -0
  30. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +698 -302
  31. package/src/llama.cpp/src/llama-kv-cache-unified.h +89 -31
  32. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -0
  33. package/src/llama.cpp/src/llama-memory-recurrent.cpp +6 -9
  34. package/src/llama.cpp/src/llama-model.cpp +2309 -665
  35. package/src/llama.cpp/src/llama-model.h +18 -4
  36. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  37. package/src/llama.cpp/src/llama-vocab.cpp +368 -9
  38. package/src/llama.cpp/src/llama-vocab.h +43 -0
  39. package/src/llama.cpp/src/unicode.cpp +207 -0
  40. package/src/llama.cpp/src/unicode.h +2 -0
package/src/llama.cpp/src/llama-model.h
@@ -32,17 +32,21 @@ enum llm_type {
  LLM_TYPE_190M,
  LLM_TYPE_220M,
  LLM_TYPE_250M,
+ LLM_TYPE_256M,
  LLM_TYPE_270M,
  LLM_TYPE_335M,
+ LLM_TYPE_350M,
  LLM_TYPE_410M,
  LLM_TYPE_450M,
  LLM_TYPE_475M,
+ LLM_TYPE_700M,
  LLM_TYPE_770M,
  LLM_TYPE_780M,
  LLM_TYPE_0_3B,
  LLM_TYPE_0_5B,
  LLM_TYPE_0_6B,
  LLM_TYPE_1B,
+ LLM_TYPE_1_2B,
  LLM_TYPE_1_3B,
  LLM_TYPE_1_4B,
  LLM_TYPE_1_5B,
@@ -95,8 +99,10 @@ enum llm_type {
  LLM_TYPE_17B_16E, // llama4 Scout
  LLM_TYPE_17B_128E, // llama4 Maverick
  LLM_TYPE_A13B,
+ LLM_TYPE_21B_A3B, // Ernie MoE small
  LLM_TYPE_30B_A3B,
  LLM_TYPE_235B_A22B,
+ LLM_TYPE_300B_A47B, // Ernie MoE big
  LLM_TYPE_E2B,
  LLM_TYPE_E4B,
  };
@@ -154,6 +160,12 @@ struct llama_layer_convnext {
  struct ggml_tensor * gamma = nullptr;
  };

+ struct llama_layer_shortconv {
+ struct ggml_tensor * in_proj = nullptr;
+ struct ggml_tensor * conv = nullptr;
+ struct ggml_tensor * out_proj = nullptr;
+ };
+
  struct llama_layer {
  // normalization
  struct ggml_tensor * attn_norm = nullptr;
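Note: the new llama_layer_shortconv struct above only declares the three weights (in_proj, conv, out_proj); how they are applied is built into the model graph elsewhere and is not part of this hunk. As a loose, hedged illustration of the role of the conv weight, the sketch below implements a minimal causal depthwise short convolution over a 1-D sequence; in_proj/out_proj would be ordinary matrix projections before and after it. This is an assumption about the block's shape, not code from the diff:

    #include <cstdio>
    #include <vector>

    // Causal short convolution: y[t] depends only on x[t-K+1..t].
    static std::vector<float> short_conv(const std::vector<float> & x, const std::vector<float> & w) {
        std::vector<float> y(x.size(), 0.0f);
        for (size_t t = 0; t < x.size(); ++t) {
            for (size_t k = 0; k < w.size() && k <= t; ++k) {
                y[t] += w[k] * x[t - k]; // no access to future positions
            }
        }
        return y;
    }

    int main() {
        const std::vector<float> x = {1.0f, 2.0f, 3.0f, 4.0f};
        const std::vector<float> w = {0.5f, 0.25f}; // kernel length 2 - a "short" filter
        for (float v : short_conv(x, w)) {
            printf("%.2f ", v); // 0.50 1.25 2.00 2.75
        }
        printf("\n");
        return 0;
    }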
@@ -174,6 +186,9 @@ struct llama_layer {
  struct ggml_tensor * attn_norm_cross = nullptr;
  struct ggml_tensor * attn_norm_enc = nullptr;
  struct ggml_tensor * ssm_norm = nullptr;
+ struct ggml_tensor * ssm_dt_norm = nullptr;
+ struct ggml_tensor * ssm_b_norm = nullptr;
+ struct ggml_tensor * ssm_c_norm = nullptr;

  // attention
  struct ggml_tensor * wq = nullptr;
@@ -337,6 +352,8 @@ struct llama_layer {
  struct llama_layer_posnet posnet;

  struct llama_layer_convnext convnext;
+
+ struct llama_layer_shortconv shortconv;
  };

  struct llama_model {
@@ -437,10 +454,7 @@ struct llama_model {
  llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;

  // TODO: move this to new llm_arch_model_i interface
- llm_graph_result_ptr build_graph(
- const llm_graph_params & params,
- ggml_cgraph * gf,
- llm_graph_type type) const;
+ ggml_cgraph * build_graph(const llm_graph_params & params) const;

  private:
  struct impl;
package/src/llama.cpp/src/llama-quant.cpp
@@ -844,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  // do not quantize Mamba's small yet 2D weights
  // NOTE: can't use LLM_TN here because the layer number is not known
  quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+ quantize &= name.find("shortconv.conv.weight") == std::string::npos;

  // do not quantize RWKV's small yet 2D weights
  quantize &= name.find("time_mix_first.weight") == std::string::npos;
@@ -883,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
  if (qtype != new_type) {
  LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
- new_type = qtype;
- break; // if two or more types are specified for the tensor, first match wins
+ new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
  }
  }
  }
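Note: the hunk above drops the break, so when several tensor-type override patterns match the same tensor name the last matching pattern now wins instead of the first. A small standalone sketch of that selection rule (illustrative only; the patterns and type names are made up, not the project's code):

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // (pattern, type) overrides in the order they were specified.
        std::vector<std::pair<std::string, std::string>> overrides = {
            {"ffn_down",          "q8_0"},
            {"blk\\.0\\.ffn_down", "f16"},
        };
        std::string tensor_name = "blk.0.ffn_down.weight";
        std::string new_type    = "q4_k"; // default choice before overrides

        for (const auto & [tname, qtype] : overrides) {
            if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                new_type = qtype; // no break: the last matching pattern wins
            }
        }
        printf("%s -> %s\n", tensor_name.c_str(), new_type.c_str()); // blk.0.ffn_down.weight -> f16
        return 0;
    }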
package/src/llama.cpp/src/llama-vocab.cpp
@@ -11,6 +11,7 @@
  #include <cassert>
  #include <cctype>
  #include <cfloat>
+ #include <cmath>
  #include <cstdarg>
  #include <cstring>
  #include <forward_list>
@@ -404,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
  "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
  };
  break;
+ case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+ regex_exprs = {
+ // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+ // The custom handler implements all K2 patterns with proper Han character exclusion
+ "\\p{Han}+",
+ };
+ break;
  case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
  regex_exprs = {
  "\\p{N}+",
@@ -1196,6 +1204,284 @@ private:
  const llm_tokenizer_rwkv & tokenizer;
  };

+ struct llm_tokenizer_plamo2 : llm_tokenizer {
+ llm_tokenizer_plamo2(const llama_vocab & vocab) {
+ build(vocab);
+ }
+
+ void build(const llama_vocab & vocab) {
+ // Reset internal structures
+ tokens_.clear();
+ bytes_.assign(256, 0);
+ to_suffix_id_.clear();
+ table_.clear();
+
+ // Build token list and byte mapping
+ std::unordered_map<std::string, float> suffix_to_score;
+ std::unordered_map<std::string, llama_token> token_to_id;
+
+ for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
+ const auto & entry = vocab.get_token_data(token_id);
+ tokens_.push_back(entry.text);
+ token_to_id[entry.text] = static_cast<llama_token>(token_id);
+
+ // Handle byte tokens
+ if (vocab.is_byte(token_id)) {
+ if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
+ std::string hex_str = entry.text.substr(3, 2);
+ int byte_val = std::stoi(hex_str, nullptr, 16);
+ bytes_[byte_val] = static_cast<llama_token>(token_id);
+ }
+ continue;
+ }
+
+ // Add token and all its suffixes to suffix_to_score
+ suffix_to_score[entry.text] = entry.score;
+
+ // Extract suffixes character by character (UTF-8 aware)
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
+ for (size_t i = 1; i < cpts.size(); ++i) {
+ std::string suffix;
+ for (size_t j = i; j < cpts.size(); ++j) {
+ suffix += unicode_cpt_to_utf8(cpts[j]);
+ }
+ if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
+ suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
+ }
+ }
+ }
+
+ // Check that all byte tokens are set
+ for (int i = 0; i < 256; ++i) {
+ if (bytes_[i] == 0) {
+ throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
+ }
+ }
+
+ // Build suffix list in lexicographical order of reversed strings
+ std::vector<std::string> suffixes;
+ for (const auto & pair : suffix_to_score) {
+ suffixes.push_back(pair.first);
+ }
+ suffixes.push_back(""); // Empty suffix
+
+ std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
+ std::string rev_a(a.rbegin(), a.rend());
+ std::string rev_b(b.rbegin(), b.rend());
+ return rev_a < rev_b;
+ });
+
+ // Build suffix_to_id and to_suffix_id_
+ std::unordered_map<std::string, int32_t> suffix_to_id;
+ int32_t num_pieces = 0;
+
+ for (const auto & suffix : suffixes) {
+ suffix_to_id[suffix] = num_pieces;
+ if (!suffix.empty()) {
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+
+ std::string remaining;
+ for (size_t i = 1; i < cpts.size(); ++i) {
+ remaining += unicode_cpt_to_utf8(cpts[i]);
+ }
+
+ int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
+ to_suffix_id_[piece_code] = num_pieces;
+
+ // Count number of pieces for this suffix
+ int32_t pieces_for_suffix = 1; // sentinel row
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+ std::string piece;
+ for (int32_t i = 0; i < piece_length; ++i) {
+ piece += unicode_cpt_to_utf8(cpts[i]);
+ }
+ if (suffix_to_score.find(piece) != suffix_to_score.end()) {
+ pieces_for_suffix++;
+ }
+ }
+ num_pieces += pieces_for_suffix;
+ } else {
+ num_pieces++; // Empty suffix contributes one piece (sentinel row)
+ }
+ }
+
+ // Build flattened table
+ table_.resize(num_pieces, std::vector<int32_t>(4, 0));
+ int32_t table_idx = 0;
+
+ for (const auto & suffix : suffixes) {
+ // Add all prefixes of the suffix to the table (in decreasing order of length)
+ std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+ for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+ std::string piece;
+ for (int32_t i = 0; i < piece_length; ++i) {
+ piece += unicode_cpt_to_utf8(cpts[i]);
+ }
+
+ auto score_it = suffix_to_score.find(piece);
+ if (score_it == suffix_to_score.end()) {
+ continue;
+ }
+
+ table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
+ auto token_it = token_to_id.find(piece);
+ table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
+
+ float score = score_it->second;
+ table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
+ static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
+ table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
+
+ table_idx++;
+ }
+
+ // Add sentinel row
+ table_[table_idx][TABLE_PIECE_LENGTH] = 1;
+ table_[table_idx][TABLE_TOKEN_ID] = -1;
+ table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
+ table_idx++;
+ }
+ }
+
+ std::vector<llama_token> encode(const std::string & text) const {
+ std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
+ // Skip the first code point if it is a BOM (Byte Order Mark)
+ if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
+ unicode_data.erase(unicode_data.begin());
+ }
+
+ if (unicode_data.empty()) {
+ return {};
+ }
+
+ const size_t data_len = unicode_data.size();
+
+ // Initialize scores array (dynamic programming)
+ std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
+ scores[data_len] = 0;
+
+ // Path array to track best tokenization
+ std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
+
+ int32_t suffix_id = 0;
+
+ // Process from end to beginning
+ for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
+ uint32_t c = unicode_data[i];
+
+ // Find next suffix ID
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
+ int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
+ auto it = to_suffix_id_.find(piece_code);
+ suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
+
+ if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
+ break;
+ }
+ }
+
+ // Update best path
+ for (size_t p = suffix_id; p < table_.size(); ++p) {
+ int32_t score = table_[p][TABLE_SCORE];
+ if (score > INVALID_SCORE) {
+ int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
+ int64_t s = scores[i + piece_length] - score;
+
+ if (s < scores[i]) {
+ scores[i] = s;
+ path[i][PATH_TOKEN_LENGTH] = piece_length;
+ path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
+ path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
+
+ if (score == UNKNOWN_SCORE) {
+ // Add UTF-8 byte count
+ path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+ }
+ }
+ }
+
+ if (score == UNKNOWN_SCORE) {
+ break;
+ }
+ }
+ }
+
+ // Decode the best path
+ std::vector<llama_token> token_ids;
+ token_ids.reserve(path[0][PATH_NUM_TOKENS]);
+
+ int pos = 0;
+ while (pos < static_cast<int>(data_len)) {
+ if (path[pos][PATH_TOKEN_ID] >= 0) {
+ token_ids.push_back(path[pos][PATH_TOKEN_ID]);
+ } else {
+ // Fall back to byte tokens
+ uint32_t c = unicode_data[pos];
+ int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+
+ for (int i = 0; i < s; ++i) {
+ uint8_t b;
+ if (s == 1) {
+ b = c;
+ } else {
+ if (i == 0) {
+ b = (0xF00 >> s) & 0xFF;
+ } else {
+ b = 0x80;
+ }
+ }
+ token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
+ }
+ }
+
+ assert(path[pos][PATH_TOKEN_LENGTH] > 0);
+ pos += path[pos][PATH_TOKEN_LENGTH];
+ }
+
+ return token_ids;
+ }
+ private:
+ // Constants for table structure
+ static constexpr int32_t TABLE_PIECE_LENGTH = 0;
+ static constexpr int32_t TABLE_TOKEN_ID = 1;
+ static constexpr int32_t TABLE_SCORE = 2;
+ static constexpr int32_t TABLE_PIECE_ID = 3;
+
+ // Constants for path array
+ static constexpr int32_t PATH_TOKEN_LENGTH = 0;
+ static constexpr int32_t PATH_TOKEN_ID = 1;
+ static constexpr int32_t PATH_NUM_TOKENS = 2;
+
+ // Score constants
+ static constexpr int32_t INVALID_SCORE = -20000000;
+ static constexpr int32_t UNKNOWN_SCORE = -10000000;
+
+ // List of tokens in the vocabulary
+ std::vector<std::string> tokens_;
+
+ // Mapping from byte code point to token ID (for byte fallback)
+ std::vector<llama_token> bytes_;
+
+ // Mapping from piece code to suffix ID
+ std::unordered_map<int64_t, int32_t> to_suffix_id_;
+
+ // Flattened table representing the Trie structure
+ // Each row contains: [piece_length, token_id, score, piece_id]
+ std::vector<std::vector<int32_t>> table_;
+ };
+
+ struct llm_tokenizer_plamo2_session {
+ llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
+
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
+ std::vector<llama_token> tokens = tokenizer.encode(text);
+ output.insert(output.end(), tokens.begin(), tokens.end());
+ }
+
+ private:
+ const llm_tokenizer_plamo2 & tokenizer;
+ };
+
  //
  // impl
  //
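Note: encode() above is a right-to-left dynamic program: scores[i] is the best (lowest) cost of tokenizing the suffix of the input starting at codepoint i, each table row proposes a piece of some length and score, and pieces with UNKNOWN_SCORE fall back to byte tokens. The standalone sketch below reproduces just that DP over a toy score map, without the reversed-suffix table or byte fallback (illustrative only; the vocabulary and cost constants are made up):

    #include <cstdio>
    #include <limits>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        // Toy vocabulary: piece -> score (higher is better, like entry.score above).
        const std::unordered_map<std::string, double> vocab = {
            {"to", -1.0}, {"ken", -1.5}, {"token", -1.2}, {"ize", -2.0},
        };
        const double UNKNOWN_COST = 10.0; // cost of falling back to a single-character token
        const std::string text = "tokenize";
        const size_t n = text.size();

        // scores[i] = minimal cost of encoding text[i..); path[i] = chosen piece length at i.
        std::vector<double> scores(n + 1, std::numeric_limits<double>::infinity());
        std::vector<size_t> path(n + 1, 1);
        scores[n] = 0.0;

        for (size_t i = n; i-- > 0; ) {                 // process right to left, as in encode()
            for (size_t len = 1; i + len <= n; ++len) { // try every piece starting at i
                const std::string piece = text.substr(i, len);
                const auto it = vocab.find(piece);
                const double cost = (it != vocab.end()) ? -it->second // known piece: cost = -score
                                  : (len == 1 ? UNKNOWN_COST          // single-char fallback
                                              : std::numeric_limits<double>::infinity());
                if (scores[i + len] + cost < scores[i]) {
                    scores[i] = scores[i + len] + cost;
                    path[i] = len;
                }
            }
        }

        // Walk the path forward to recover the segmentation: prints "token" "ize".
        for (size_t pos = 0; pos < n; pos += path[pos]) {
            printf("\"%s\" ", text.substr(pos, path[pos]).c_str());
        }
        printf("\n");
        return 0;
    }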
@@ -1499,6 +1785,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  special_unk_id = LLAMA_TOKEN_NULL;
  special_sep_id = LLAMA_TOKEN_NULL;
  special_pad_id = LLAMA_TOKEN_NULL;
+ } else if (tokenizer_model == "plamo2") {
+ type = LLAMA_VOCAB_TYPE_PLAMO2;
+
+ // PLaMo-2 default special tokens (these will be overridden by model config)
+ special_bos_id = 1; // <|plamo:bos|>
+ special_eos_id = 2; // <|plamo:eos|>
+ special_unk_id = 0; // <|plamo:unk|>
+ special_sep_id = LLAMA_TOKEN_NULL;
+ special_pad_id = 3; // <|plamo:pad|>
+ special_mask_id = LLAMA_TOKEN_NULL;
  } else {
  throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  }
@@ -1524,7 +1820,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "llama-bpe"||
  tokenizer_pre == "falcon3" ||
  tokenizer_pre == "falcon-h1" ||
- tokenizer_pre == "pixtral") {
+ tokenizer_pre == "pixtral" ||
+ tokenizer_pre == "midm-2.0" ||
+ tokenizer_pre == "lfm2") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
  ignore_merges = true;
  add_bos = true;
@@ -1627,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  } else if (
  tokenizer_pre == "exaone") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+ } else if (
+ tokenizer_pre == "exaone4") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
  } else if (
  tokenizer_pre == "chameleon") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -1663,6 +1964,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  tokenizer_pre == "hunyuan") {
  pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
  clean_spaces = false;
+ } else if (
+ tokenizer_pre == "kimi-k2") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+ clean_spaces = false;
  } else {
  throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
  }
@@ -1846,6 +2151,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<EOT>"
  || t.first == "_<EOT>"
  || t.first == "<|end▁of▁sentence|>" // DeepSeek
+ || t.first == "<end_of_utterance>" // smoldocling
  ) {
  special_eot_id = t.second;
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2005,6 +2311,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
  || t.first == "<EOT>"
  || t.first == "_<EOT>"
  || t.first == "<|end_of_text|>"
+ || t.first == "<end_of_utterance>" // smoldocling
  ) {
  special_eog_ids.insert(t.second);
  if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2141,13 +2448,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {

  std::string llama_vocab::impl::type_name() const{
  switch (type) {
- case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
- case LLAMA_VOCAB_TYPE_SPM: return "SPM";
- case LLAMA_VOCAB_TYPE_BPE: return "BPE";
- case LLAMA_VOCAB_TYPE_WPM: return "WPM";
- case LLAMA_VOCAB_TYPE_UGM: return "UGM";
- case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
- default: return "unknown";
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+ case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
+ default: return "unknown";
  }
  }

@@ -2230,6 +2538,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
  case LLAMA_VOCAB_TYPE_RWKV:
  tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
  break;
+ case LLAMA_VOCAB_TYPE_PLAMO2:
+ tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
+ break;
  default:
  GGML_ABORT("unsupported vocab type");
  }
@@ -2562,6 +2873,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
  if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
  std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_PLAMO2:
+ {
+ llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
  #ifdef PRETOKENIZERDEBUG
  LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
  #endif
@@ -2660,6 +2988,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
  memcpy(buf, result.data(), result.size());
  return (int)result.size();
  }
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
+ // PLaMo-2 uses similar token handling as BPE/SPM
+ if (vocab.is_byte(token)) {
+ // Handle byte tokens like <0xXX>
+ if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
+ int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
+ if (length < 1) {
+ return -1;
+ }
+ buf[0] = static_cast<char>(hex_val);
+ return 1;
+ }
+ }
+
+ // Normal token - just copy the text
+ std::string result = token_text;
+ return _try_copy(result.data(), result.size());
+ }
  default:
  GGML_ABORT("fatal error");
  }
@@ -2904,6 +3250,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
  case LLAMA_VOCAB_TYPE_BPE: {
  return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
  }
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
+ // PLaMo-2 uses byte tokens in format <0xXX>
+ char hex_str[8];
+ snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
+ return pimpl->token_to_id.at(hex_str);
+ }
  default:
  GGML_ABORT("fatal error");
  }
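Note: both PLaMo-2 paths above rely on the same "<0xXX>" spelling for byte tokens: token_to_piece() parses the two hex digits back into a raw byte, and byte_to_token() formats a byte into that string to look up its token ID. A quick standalone round-trip of just the formatting and parsing (illustrative only, not taken from the library):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    int main() {
        const uint8_t ch = 0x41; // 'A'

        // byte -> "<0xXX>" (the lookup key byte_to_token() builds)
        char hex_str[8];
        snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
        printf("%s\n", hex_str); // <0x41>

        // "<0xXX>" -> byte (what token_to_piece() writes into the output buffer)
        const std::string token_text = hex_str;
        if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
            int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
            printf("0x%02X\n", hex_val); // 0x41
        }
        return 0;
    }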
@@ -3005,6 +3357,10 @@ llama_token llama_vocab::token_fim_sep() const {
  return pimpl->special_fim_sep_id;
  }

+ llama_token llama_vocab::token_mask() const {
+ return pimpl->special_mask_id;
+ }
+
  bool llama_vocab::get_add_space_prefix() const {
  return pimpl->add_space_prefix;
  }
@@ -3245,6 +3601,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
  return vocab->token_fim_sep();
  }

+ llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
+ return vocab->token_mask();
+ }
+
  // deprecated
  const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
  return llama_vocab_get_text(vocab, token);
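Note: llama_vocab_mask() above exposes the new mask special token through the C-style API, mirroring the other special-token getters in this file. A hedged usage sketch follows; it assumes only the accessor shown here plus the LLAMA_TOKEN_NULL convention used elsewhere in this diff, and that the declaration is available via llama.h:

    // Illustrative fragment: check whether the loaded vocab defines a mask token
    // (e.g. a BERT-style [MASK]) before trying to use it.
    #include "llama.h"
    #include <cstdio>

    static void report_mask_token(const struct llama_vocab * vocab) {
        const llama_token mask = llama_vocab_mask(vocab);
        if (mask == LLAMA_TOKEN_NULL) {
            printf("vocab has no mask token\n");
        } else {
            printf("mask token id: %d\n", mask);
        }
    }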
@@ -3381,4 +3741,3 @@ int32_t llama_detokenize(
  bool unparse_special) {
  return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
  }
-
package/src/llama.cpp/src/llama-vocab.h
@@ -6,6 +6,48 @@
  #include <vector>
  #include <memory>

+ // pre-tokenization types
+ enum llama_vocab_pre_type {
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+ LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
+ LLAMA_VOCAB_PRE_TYPE_MPT = 5,
+ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
+ LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+ LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
+ LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
+ LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+ LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+ LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+ LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+ LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+ LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
+ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+ LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+ LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+ };
+
  struct LLM_KV;
  struct llama_model_loader;

@@ -59,6 +101,7 @@ struct llama_vocab {
  llama_token token_sep() const;
  llama_token token_nl () const;
  llama_token token_pad() const;
+ llama_token token_mask() const;

  llama_token token_prefix() const;
  llama_token token_middle() const;