@fugood/llama.node 1.0.2 → 1.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. package/package.json +14 -14
  2. package/src/llama.cpp/CMakeLists.txt +0 -1
  3. package/src/llama.cpp/common/CMakeLists.txt +4 -5
  4. package/src/llama.cpp/common/arg.cpp +44 -0
  5. package/src/llama.cpp/common/common.cpp +22 -6
  6. package/src/llama.cpp/common/common.h +15 -1
  7. package/src/llama.cpp/ggml/CMakeLists.txt +10 -2
  8. package/src/llama.cpp/ggml/include/ggml-webgpu.h +19 -0
  9. package/src/llama.cpp/ggml/include/ggml.h +104 -10
  10. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  11. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +6 -1
  12. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +12 -1
  13. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +343 -1094
  14. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +749 -163
  15. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -0
  16. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +1 -1
  17. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +12 -9
  18. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +88 -9
  19. package/src/llama.cpp/include/llama.h +13 -47
  20. package/src/llama.cpp/src/llama-arch.cpp +298 -3
  21. package/src/llama.cpp/src/llama-arch.h +22 -1
  22. package/src/llama.cpp/src/llama-batch.cpp +103 -71
  23. package/src/llama.cpp/src/llama-batch.h +31 -18
  24. package/src/llama.cpp/src/llama-chat.cpp +59 -1
  25. package/src/llama.cpp/src/llama-chat.h +3 -0
  26. package/src/llama.cpp/src/llama-context.cpp +134 -95
  27. package/src/llama.cpp/src/llama-context.h +13 -16
  28. package/src/llama.cpp/src/llama-cparams.h +3 -2
  29. package/src/llama.cpp/src/llama-graph.cpp +279 -180
  30. package/src/llama.cpp/src/llama-graph.h +183 -122
  31. package/src/llama.cpp/src/llama-hparams.cpp +47 -1
  32. package/src/llama.cpp/src/llama-hparams.h +12 -1
  33. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +38 -22
  34. package/src/llama.cpp/src/llama-kv-cache-unified-iswa.h +7 -2
  35. package/src/llama.cpp/src/llama-kv-cache-unified.cpp +849 -304
  36. package/src/llama.cpp/src/llama-kv-cache-unified.h +143 -47
  37. package/src/llama.cpp/src/llama-kv-cells.h +62 -10
  38. package/src/llama.cpp/src/llama-memory-hybrid.cpp +10 -4
  39. package/src/llama.cpp/src/llama-memory-hybrid.h +3 -1
  40. package/src/llama.cpp/src/llama-memory-recurrent.cpp +21 -11
  41. package/src/llama.cpp/src/llama-memory.cpp +17 -0
  42. package/src/llama.cpp/src/llama-memory.h +3 -0
  43. package/src/llama.cpp/src/llama-model.cpp +3373 -743
  44. package/src/llama.cpp/src/llama-model.h +20 -4
  45. package/src/llama.cpp/src/llama-quant.cpp +2 -2
  46. package/src/llama.cpp/src/llama-vocab.cpp +376 -10
  47. package/src/llama.cpp/src/llama-vocab.h +43 -0
  48. package/src/llama.cpp/src/unicode.cpp +207 -0
  49. package/src/llama.cpp/src/unicode.h +2 -0
  50. package/src/llama.cpp/ggml/include/ggml-kompute.h +0 -50
package/src/llama.cpp/src/llama-model.h

@@ -32,17 +32,21 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M,
     LLM_TYPE_270M,
     LLM_TYPE_335M,
+    LLM_TYPE_350M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
     LLM_TYPE_475M,
+    LLM_TYPE_700M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
     LLM_TYPE_1B,
+    LLM_TYPE_1_2B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
@@ -94,8 +98,11 @@ enum llm_type {
     LLM_TYPE_57B_A14B,
     LLM_TYPE_17B_16E, // llama4 Scout
     LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_A13B,
+    LLM_TYPE_21B_A3B, // Ernie MoE small
     LLM_TYPE_30B_A3B,
     LLM_TYPE_235B_A22B,
+    LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
 };
@@ -153,6 +160,12 @@ struct llama_layer_convnext {
     struct ggml_tensor * gamma = nullptr;
 };
 
+struct llama_layer_shortconv {
+    struct ggml_tensor * in_proj = nullptr;
+    struct ggml_tensor * conv = nullptr;
+    struct ggml_tensor * out_proj = nullptr;
+};
+
 struct llama_layer {
     // normalization
     struct ggml_tensor * attn_norm = nullptr;
@@ -172,6 +185,10 @@ struct llama_layer {
     struct ggml_tensor * ffn_sub_norm = nullptr;
     struct ggml_tensor * attn_norm_cross = nullptr;
     struct ggml_tensor * attn_norm_enc = nullptr;
+    struct ggml_tensor * ssm_norm = nullptr;
+    struct ggml_tensor * ssm_dt_norm = nullptr;
+    struct ggml_tensor * ssm_b_norm = nullptr;
+    struct ggml_tensor * ssm_c_norm = nullptr;
 
     // attention
     struct ggml_tensor * wq = nullptr;
@@ -335,6 +352,8 @@ struct llama_layer {
     struct llama_layer_posnet posnet;
 
     struct llama_layer_convnext convnext;
+
+    struct llama_layer_shortconv shortconv;
 };
 
 struct llama_model {
@@ -435,10 +454,7 @@ struct llama_model {
     llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
-    llm_graph_result_ptr build_graph(
-            const llm_graph_params & params,
-            ggml_cgraph * gf,
-            llm_graph_type type) const;
+    ggml_cgraph * build_graph(const llm_graph_params & params) const;
 
 private:
     struct impl;
package/src/llama.cpp/src/llama-quant.cpp

@@ -844,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // do not quantize Mamba's small yet 2D weights
     // NOTE: can't use LLM_TN here because the layer number is not known
     quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+    quantize &= name.find("shortconv.conv.weight") == std::string::npos;
 
     // do not quantize RWKV's small yet 2D weights
     quantize &= name.find("time_mix_first.weight") == std::string::npos;
@@ -883,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
         if (qtype != new_type) {
             LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-            new_type = qtype;
-            break; // if two or more types are specified for the tensor, first match wins
+            new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
         }
     }
 }
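
The second hunk above changes how per-tensor quantization-type overrides are resolved: with the `break` removed, when several override patterns match the same tensor name the last one specified now wins rather than the first. A standalone sketch of the new resolution order follows; the override list, tensor name, and string type labels are hypothetical, and the real code assigns a `ggml_type` rather than a string.

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // hypothetical per-tensor overrides, in the order they were specified
        std::vector<std::pair<std::string, std::string>> overrides = {
            { "ffn_down", "q5_k" },   // matches blk.0.ffn_down.weight
            { "ffn_.*",   "q4_k" },   // also matches; with the new rule this one wins
        };

        const std::string tensor_name = "blk.0.ffn_down.weight";

        std::string new_type = "q8_0"; // default chosen before overrides are applied
        for (const auto & [tname, qtype] : overrides) {
            if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                new_type = qtype; // last match wins (the old code would `break` here)
            }
        }

        std::printf("%s -> %s\n", tensor_name.c_str(), new_type.c_str()); // prints q4_k
    }
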
package/src/llama.cpp/src/llama-vocab.cpp

@@ -11,6 +11,7 @@
 #include <cassert>
 #include <cctype>
 #include <cfloat>
+#include <cmath>
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
@@ -351,6 +352,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                 regex_exprs = {
                     // original regex from tokenizer.json
                     // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -403,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
+                regex_exprs = {
+                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
+                    // The custom handler implements all K2 patterns with proper Han character exclusion
+                    "\\p{Han}+",
+                };
+                break;
             case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                 regex_exprs = {
                     "\\p{N}+",
@@ -1195,6 +1204,284 @@ private:
     const llm_tokenizer_rwkv & tokenizer;
 };
 
+struct llm_tokenizer_plamo2 : llm_tokenizer {
+    llm_tokenizer_plamo2(const llama_vocab & vocab) {
+        build(vocab);
+    }
+
+    void build(const llama_vocab & vocab) {
+        // Reset internal structures
+        tokens_.clear();
+        bytes_.assign(256, 0);
+        to_suffix_id_.clear();
+        table_.clear();
+
+        // Build token list and byte mapping
+        std::unordered_map<std::string, float> suffix_to_score;
+        std::unordered_map<std::string, llama_token> token_to_id;
+
+        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
+            const auto & entry = vocab.get_token_data(token_id);
+            tokens_.push_back(entry.text);
+            token_to_id[entry.text] = static_cast<llama_token>(token_id);
+
+            // Handle byte tokens
+            if (vocab.is_byte(token_id)) {
+                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
+                    std::string hex_str = entry.text.substr(3, 2);
+                    int byte_val = std::stoi(hex_str, nullptr, 16);
+                    bytes_[byte_val] = static_cast<llama_token>(token_id);
+                }
+                continue;
+            }
+
+            // Add token and all its suffixes to suffix_to_score
+            suffix_to_score[entry.text] = entry.score;
+
+            // Extract suffixes character by character (UTF-8 aware)
+            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
+            for (size_t i = 1; i < cpts.size(); ++i) {
+                std::string suffix;
+                for (size_t j = i; j < cpts.size(); ++j) {
+                    suffix += unicode_cpt_to_utf8(cpts[j]);
+                }
+                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
+                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
+                }
+            }
+        }
+
+        // Check that all byte tokens are set
+        for (int i = 0; i < 256; ++i) {
+            if (bytes_[i] == 0) {
+                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
+            }
+        }
+
+        // Build suffix list in lexicographical order of reversed strings
+        std::vector<std::string> suffixes;
+        for (const auto & pair : suffix_to_score) {
+            suffixes.push_back(pair.first);
+        }
+        suffixes.push_back(""); // Empty suffix
+
+        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
+            std::string rev_a(a.rbegin(), a.rend());
+            std::string rev_b(b.rbegin(), b.rend());
+            return rev_a < rev_b;
+        });
+
+        // Build suffix_to_id and to_suffix_id_
+        std::unordered_map<std::string, int32_t> suffix_to_id;
+        int32_t num_pieces = 0;
+
+        for (const auto & suffix : suffixes) {
+            suffix_to_id[suffix] = num_pieces;
+            if (!suffix.empty()) {
+                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+
+                std::string remaining;
+                for (size_t i = 1; i < cpts.size(); ++i) {
+                    remaining += unicode_cpt_to_utf8(cpts[i]);
+                }
+
+                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
+                to_suffix_id_[piece_code] = num_pieces;
+
+                // Count number of pieces for this suffix
+                int32_t pieces_for_suffix = 1; // sentinel row
+                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+                    std::string piece;
+                    for (int32_t i = 0; i < piece_length; ++i) {
+                        piece += unicode_cpt_to_utf8(cpts[i]);
+                    }
+                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
+                        pieces_for_suffix++;
+                    }
+                }
+                num_pieces += pieces_for_suffix;
+            } else {
+                num_pieces++; // Empty suffix contributes one piece (sentinel row)
+            }
+        }
+
+        // Build flattened table
+        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
+        int32_t table_idx = 0;
+
+        for (const auto & suffix : suffixes) {
+            // Add all prefixes of the suffix to the table (in decreasing order of length)
+            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
+            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
+                std::string piece;
+                for (int32_t i = 0; i < piece_length; ++i) {
+                    piece += unicode_cpt_to_utf8(cpts[i]);
+                }
+
+                auto score_it = suffix_to_score.find(piece);
+                if (score_it == suffix_to_score.end()) {
+                    continue;
+                }
+
+                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
+                auto token_it = token_to_id.find(piece);
+                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;
+
+                float score = score_it->second;
+                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
+                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
+                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];
+
+                table_idx++;
+            }
+
+            // Add sentinel row
+            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
+            table_[table_idx][TABLE_TOKEN_ID] = -1;
+            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
+            table_idx++;
+        }
+    }
+
+    std::vector<llama_token> encode(const std::string & text) const {
+        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
+        // Skip the first code point if it is a BOM (Byte Order Mark)
+        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
+            unicode_data.erase(unicode_data.begin());
+        }
+
+        if (unicode_data.empty()) {
+            return {};
+        }
+
+        const size_t data_len = unicode_data.size();
+
+        // Initialize scores array (dynamic programming)
+        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
+        scores[data_len] = 0;
+
+        // Path array to track best tokenization
+        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));
+
+        int32_t suffix_id = 0;
+
+        // Process from end to beginning
+        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
+            uint32_t c = unicode_data[i];
+
+            // Find next suffix ID
+            for (size_t p = suffix_id; p < table_.size(); ++p) {
+                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
+                auto it = to_suffix_id_.find(piece_code);
+                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;
+
+                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
+                    break;
+                }
+            }
+
+            // Update best path
+            for (size_t p = suffix_id; p < table_.size(); ++p) {
+                int32_t score = table_[p][TABLE_SCORE];
+                if (score > INVALID_SCORE) {
+                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
+                    int64_t s = scores[i + piece_length] - score;
+
+                    if (s < scores[i]) {
+                        scores[i] = s;
+                        path[i][PATH_TOKEN_LENGTH] = piece_length;
+                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
+                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;
+
+                        if (score == UNKNOWN_SCORE) {
+                            // Add UTF-8 byte count
+                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+                        }
+                    }
+                }
+
+                if (score == UNKNOWN_SCORE) {
+                    break;
+                }
+            }
+        }
+
+        // Decode the best path
+        std::vector<llama_token> token_ids;
+        token_ids.reserve(path[0][PATH_NUM_TOKENS]);
+
+        int pos = 0;
+        while (pos < static_cast<int>(data_len)) {
+            if (path[pos][PATH_TOKEN_ID] >= 0) {
+                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
+            } else {
+                // Fall back to byte tokens
+                uint32_t c = unicode_data[pos];
+                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
+
+                for (int i = 0; i < s; ++i) {
+                    uint8_t b;
+                    if (s == 1) {
+                        b = c;
+                    } else {
+                        if (i == 0) {
+                            b = (0xF00 >> s) & 0xFF;
+                        } else {
+                            b = 0x80;
+                        }
+                    }
+                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
+                }
+            }
+
+            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
+            pos += path[pos][PATH_TOKEN_LENGTH];
+        }
+
+        return token_ids;
+    }
+private:
+    // Constants for table structure
+    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
+    static constexpr int32_t TABLE_TOKEN_ID = 1;
+    static constexpr int32_t TABLE_SCORE = 2;
+    static constexpr int32_t TABLE_PIECE_ID = 3;
+
+    // Constants for path array
+    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
+    static constexpr int32_t PATH_TOKEN_ID = 1;
+    static constexpr int32_t PATH_NUM_TOKENS = 2;
+
+    // Score constants
+    static constexpr int32_t INVALID_SCORE = -20000000;
+    static constexpr int32_t UNKNOWN_SCORE = -10000000;
+
+    // List of tokens in the vocabulary
+    std::vector<std::string> tokens_;
+
+    // Mapping from byte code point to token ID (for byte fallback)
+    std::vector<llama_token> bytes_;
+
+    // Mapping from piece code to suffix ID
+    std::unordered_map<int64_t, int32_t> to_suffix_id_;
+
+    // Flattened table representing the Trie structure
+    // Each row contains: [piece_length, token_id, score, piece_id]
+    std::vector<std::vector<int32_t>> table_;
+};
+
+struct llm_tokenizer_plamo2_session {
+    llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
+
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
+        std::vector<llama_token> tokens = tokenizer.encode(text);
+        output.insert(output.end(), tokens.begin(), tokens.end());
+    }
+
+private:
+    const llm_tokenizer_plamo2 & tokenizer;
+};
+
 //
 // impl
 //
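
The PLaMo-2 tokenizer added above builds a reversed-suffix table from the vocabulary and then runs a dynamic program over code points, with `<0xXX>` byte tokens as fallback. Its wiring into `llama_vocab::impl::tokenize` appears in a later hunk; a minimal usage sketch, assuming `vocab` is a `llama_vocab` loaded with the "plamo2" tokenizer model:

    // Sketch only: `vocab` is assumed to be a llama_vocab whose tokenizer model is "plamo2".
    llm_tokenizer_plamo2 tokenizer(vocab);            // builds the suffix table from the vocab
    llm_tokenizer_plamo2_session session(tokenizer);  // thin per-call wrapper, mirroring the other tokenizers

    std::vector<llama_token> out;
    session.tokenize("Hello, world", out);            // appends the encoded token IDs to `out`
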
@@ -1498,6 +1785,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         special_unk_id = LLAMA_TOKEN_NULL;
         special_sep_id = LLAMA_TOKEN_NULL;
         special_pad_id = LLAMA_TOKEN_NULL;
+    } else if (tokenizer_model == "plamo2") {
+        type = LLAMA_VOCAB_TYPE_PLAMO2;
+
+        // PLaMo-2 default special tokens (these will be overridden by model config)
+        special_bos_id = 1; // <|plamo:bos|>
+        special_eos_id = 2; // <|plamo:eos|>
+        special_unk_id = 0; // <|plamo:unk|>
+        special_sep_id = LLAMA_TOKEN_NULL;
+        special_pad_id = 3; // <|plamo:pad|>
+        special_mask_id = LLAMA_TOKEN_NULL;
     } else {
         throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
     }
@@ -1522,7 +1819,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
                 tokenizer_pre == "falcon3" ||
-                tokenizer_pre == "pixtral") {
+                tokenizer_pre == "falcon-h1" ||
+                tokenizer_pre == "pixtral" ||
+                tokenizer_pre == "midm-2.0" ||
+                tokenizer_pre == "lfm2") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
             add_bos = true;
@@ -1554,7 +1854,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de") {
+                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "a.x-4.0") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "jina-v1-en" ||
@@ -1624,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
         } else if (
                 tokenizer_pre == "exaone") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+        } else if (
+                tokenizer_pre == "exaone4") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "chameleon") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
@@ -1656,6 +1960,14 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "hunyuan") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+            clean_spaces = false;
+        } else if (
+                tokenizer_pre == "kimi-k2") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -1839,6 +2151,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                    || t.first == "<end_of_utterance>" // smoldocling
                ) {
                 special_eot_id = t.second;
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1998,6 +2311,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
+                    || t.first == "<end_of_utterance>" // smoldocling
                ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2134,13 +2448,14 @@ enum llama_vocab_type llama_vocab::impl::get_type() const {
 
 std::string llama_vocab::impl::type_name() const{
     switch (type) {
-        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
-        case LLAMA_VOCAB_TYPE_SPM:  return "SPM";
-        case LLAMA_VOCAB_TYPE_BPE:  return "BPE";
-        case LLAMA_VOCAB_TYPE_WPM:  return "WPM";
-        case LLAMA_VOCAB_TYPE_UGM:  return "UGM";
-        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
-        default:                    return "unknown";
+        case LLAMA_VOCAB_TYPE_NONE:   return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM:    return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE:    return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM:    return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM:    return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV:   return "RWKV";
+        case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
+        default:                      return "unknown";
     }
 }
 
@@ -2223,6 +2538,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
         case LLAMA_VOCAB_TYPE_RWKV:
             tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
            break;
+        case LLAMA_VOCAB_TYPE_PLAMO2:
+            tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
+            break;
         default:
             GGML_ABORT("unsupported vocab type");
     }
@@ -2555,6 +2873,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_PLAMO2:
+            {
+                llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
 #ifdef PRETOKENIZERDEBUG
                         LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
@@ -2653,6 +2988,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                 memcpy(buf, result.data(), result.size());
                 return (int)result.size();
             }
+        case LLAMA_VOCAB_TYPE_PLAMO2: {
+            // PLaMo-2 uses similar token handling as BPE/SPM
+            if (vocab.is_byte(token)) {
+                // Handle byte tokens like <0xXX>
+                if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
+                    int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
+                    if (length < 1) {
+                        return -1;
+                    }
+                    buf[0] = static_cast<char>(hex_val);
+                    return 1;
+                }
+            }
+
+            // Normal token - just copy the text
+            std::string result = token_text;
+            return _try_copy(result.data(), result.size());
+        }
         default:
             GGML_ABORT("fatal error");
     }
@@ -2897,6 +3250,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
         case LLAMA_VOCAB_TYPE_BPE: {
             return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
         }
+        case LLAMA_VOCAB_TYPE_PLAMO2: {
+            // PLaMo-2 uses byte tokens in format <0xXX>
+            char hex_str[8];
+            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
+            return pimpl->token_to_id.at(hex_str);
+        }
         default:
             GGML_ABORT("fatal error");
     }
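
Both new PLaMo-2 cases above rely on the `<0xXX>` byte-token convention for byte fallback. A small self-contained round-trip check of that convention, mirroring the `snprintf` and `std::stoi` calls in the hunks (standalone sketch, not part of the package):

    #include <cassert>
    #include <cstdio>
    #include <string>

    int main() {
        const unsigned char ch = 0xE3; // e.g. the first byte of a UTF-8 encoded Japanese character

        // byte -> token text, as in llama_vocab::byte_to_token for PLaMo-2
        char hex_str[8];
        std::snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);   // "<0xE3>"

        // token text -> byte, as in token_to_piece for PLaMo-2
        const std::string token_text = hex_str;
        assert(token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>');
        const int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);

        std::printf("%s -> 0x%02X\n", token_text.c_str(), hex_val); // <0xE3> -> 0xE3
        return hex_val == ch ? 0 : 1;
    }
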
@@ -2998,6 +3357,10 @@ llama_token llama_vocab::token_fim_sep() const {
     return pimpl->special_fim_sep_id;
 }
 
+llama_token llama_vocab::token_mask() const {
+    return pimpl->special_mask_id;
+}
+
 bool llama_vocab::get_add_space_prefix() const {
     return pimpl->add_space_prefix;
 }
@@ -3238,6 +3601,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
     return vocab->token_fim_sep();
 }
 
+llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
+    return vocab->token_mask();
+}
+
 // deprecated
 const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
     return llama_vocab_get_text(vocab, token);
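
`llama_vocab_mask` is the new public counterpart of `llama_vocab::token_mask()`, matching the other special-token accessors. A hedged usage sketch; `llama_model_get_vocab` and `LLAMA_TOKEN_NULL` are assumed from the existing llama.h API and `model` is a previously loaded model:

    // Sketch: query the mask token of a loaded model, guarding against vocabs that don't define one.
    const llama_vocab * vocab = llama_model_get_vocab(model);
    const llama_token   mask  = llama_vocab_mask(vocab);
    if (mask == LLAMA_TOKEN_NULL) {
        // this vocab has no mask token (e.g. most decoder-only LLMs)
    } else {
        // the mask token ID can be used, e.g. for fill-mask style prompting with encoder models
    }
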
@@ -3374,4 +3741,3 @@ int32_t llama_detokenize(
         bool unparse_special) {
     return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
 }
-
package/src/llama.cpp/src/llama-vocab.h

@@ -6,6 +6,48 @@
 #include <vector>
 #include <memory>
 
+// pre-tokenization types
+enum llama_vocab_pre_type {
+    LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+    LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+    LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+    LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+    LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
+    LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
+    LLAMA_VOCAB_PRE_TYPE_STABLELM2      = 10,
+    LLAMA_VOCAB_PRE_TYPE_QWEN2          = 11,
+    LLAMA_VOCAB_PRE_TYPE_OLMO           = 12,
+    LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
+    LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
+    LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
+    LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
+    LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
+    LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
+    LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
+    LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
+    LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+    LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
+    LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
+    LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
+    LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
+    LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
+    LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
+    LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+    LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
+    LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
+    LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+    LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+    LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+    LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+    LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+    LLAMA_VOCAB_PRE_TYPE_KIMI_K2        = 37,
+};
+
 struct LLM_KV;
 struct llama_model_loader;
 
@@ -59,6 +101,7 @@ struct llama_vocab {
     llama_token token_sep() const;
     llama_token token_nl () const;
     llama_token token_pad() const;
+    llama_token token_mask() const;
 
     llama_token token_prefix() const;
     llama_token token_middle() const;