@fugood/llama.node 0.3.17 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. package/CMakeLists.txt +3 -1
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +39 -2
  19. package/lib/index.js +132 -1
  20. package/lib/index.ts +203 -3
  21. package/package.json +2 -1
  22. package/src/EmbeddingWorker.cpp +1 -1
  23. package/src/LlamaCompletionWorker.cpp +366 -19
  24. package/src/LlamaCompletionWorker.h +30 -10
  25. package/src/LlamaContext.cpp +213 -5
  26. package/src/LlamaContext.h +12 -0
  27. package/src/common.hpp +15 -0
  28. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +133 -24
  29. package/src/llama.cpp/.github/workflows/build.yml +41 -762
  30. package/src/llama.cpp/.github/workflows/docker.yml +5 -2
  31. package/src/llama.cpp/.github/workflows/release.yml +716 -0
  32. package/src/llama.cpp/.github/workflows/server.yml +12 -12
  33. package/src/llama.cpp/CMakeLists.txt +5 -17
  34. package/src/llama.cpp/cmake/build-info.cmake +8 -2
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
  36. package/src/llama.cpp/common/CMakeLists.txt +31 -3
  37. package/src/llama.cpp/common/arg.cpp +48 -29
  38. package/src/llama.cpp/common/chat.cpp +128 -106
  39. package/src/llama.cpp/common/chat.h +2 -0
  40. package/src/llama.cpp/common/common.cpp +37 -1
  41. package/src/llama.cpp/common/common.h +18 -9
  42. package/src/llama.cpp/common/llguidance.cpp +1 -0
  43. package/src/llama.cpp/common/minja/chat-template.hpp +9 -5
  44. package/src/llama.cpp/common/minja/minja.hpp +69 -36
  45. package/src/llama.cpp/common/regex-partial.cpp +204 -0
  46. package/src/llama.cpp/common/regex-partial.h +56 -0
  47. package/src/llama.cpp/common/sampling.cpp +57 -50
  48. package/src/llama.cpp/examples/CMakeLists.txt +2 -23
  49. package/src/llama.cpp/examples/embedding/embedding.cpp +2 -11
  50. package/src/llama.cpp/examples/parallel/parallel.cpp +86 -14
  51. package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
  52. package/src/llama.cpp/examples/training/finetune.cpp +96 -0
  53. package/src/llama.cpp/ggml/CMakeLists.txt +27 -0
  54. package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
  55. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
  56. package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
  57. package/src/llama.cpp/ggml/include/ggml.h +10 -7
  58. package/src/llama.cpp/ggml/src/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
  60. package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
  61. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +20 -13
  62. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +0 -2
  63. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +306 -6
  64. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +4 -13
  65. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +29 -16
  66. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +88 -5
  67. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -12
  68. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +264 -69
  69. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +501 -0
  70. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +0 -13
  71. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +0 -6
  72. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
  73. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +36 -11
  74. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -2
  75. package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
  76. package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
  77. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +41 -27
  78. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +29 -23
  79. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +9 -8
  80. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +121 -232
  81. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +7 -15
  82. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
  83. package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
  84. package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
  85. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
  86. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -23
  87. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
  88. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +338 -166
  89. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
  90. package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
  91. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
  92. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +81 -70
  93. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +657 -193
  94. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +20 -0
  95. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +123 -29
  96. package/src/llama.cpp/ggml/src/ggml.c +29 -20
  97. package/src/llama.cpp/ggml/src/gguf.cpp +33 -33
  98. package/src/llama.cpp/include/llama.h +52 -11
  99. package/src/llama.cpp/requirements/requirements-all.txt +3 -3
  100. package/src/llama.cpp/scripts/xxd.cmake +1 -1
  101. package/src/llama.cpp/src/CMakeLists.txt +1 -0
  102. package/src/llama.cpp/src/llama-adapter.cpp +6 -0
  103. package/src/llama.cpp/src/llama-arch.cpp +3 -0
  104. package/src/llama.cpp/src/llama-batch.cpp +5 -1
  105. package/src/llama.cpp/src/llama-batch.h +2 -1
  106. package/src/llama.cpp/src/llama-chat.cpp +17 -7
  107. package/src/llama.cpp/src/llama-chat.h +1 -0
  108. package/src/llama.cpp/src/llama-context.cpp +389 -501
  109. package/src/llama.cpp/src/llama-context.h +44 -32
  110. package/src/llama.cpp/src/llama-cparams.h +1 -0
  111. package/src/llama.cpp/src/llama-graph.cpp +20 -38
  112. package/src/llama.cpp/src/llama-graph.h +12 -8
  113. package/src/llama.cpp/src/llama-kv-cache.cpp +1503 -389
  114. package/src/llama.cpp/src/llama-kv-cache.h +271 -85
  115. package/src/llama.cpp/src/llama-memory.h +11 -1
  116. package/src/llama.cpp/src/llama-model-loader.cpp +24 -15
  117. package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
  118. package/src/llama.cpp/src/llama-model-saver.h +37 -0
  119. package/src/llama.cpp/src/llama-model.cpp +316 -69
  120. package/src/llama.cpp/src/llama-model.h +8 -1
  121. package/src/llama.cpp/src/llama-quant.cpp +15 -13
  122. package/src/llama.cpp/src/llama-sampling.cpp +18 -6
  123. package/src/llama.cpp/src/llama-vocab.cpp +42 -4
  124. package/src/llama.cpp/src/llama-vocab.h +6 -0
  125. package/src/llama.cpp/src/llama.cpp +14 -0
  126. package/src/llama.cpp/tests/CMakeLists.txt +10 -2
  127. package/src/llama.cpp/tests/test-backend-ops.cpp +107 -47
  128. package/src/llama.cpp/tests/test-chat-template.cpp +10 -11
  129. package/src/llama.cpp/tests/test-chat.cpp +3 -1
  130. package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
  131. package/src/llama.cpp/tests/test-opt.cpp +33 -21
  132. package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
  133. package/src/llama.cpp/tests/test-sampling.cpp +1 -1
  134. package/src/llama.cpp/tools/CMakeLists.txt +39 -0
  135. package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +2 -2
  136. package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
  137. package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +495 -348
  138. package/src/llama.cpp/{examples → tools}/main/main.cpp +6 -9
  139. package/src/llama.cpp/{examples/llava → tools/mtmd}/CMakeLists.txt +1 -35
  140. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip-impl.h +25 -5
  141. package/src/llama.cpp/{examples/llava → tools/mtmd}/clip.cpp +1440 -1349
  142. package/src/llama.cpp/tools/mtmd/clip.h +99 -0
  143. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd-cli.cpp +70 -44
  144. package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
  145. package/src/llama.cpp/{examples/llava → tools/mtmd}/mtmd.cpp +251 -281
  146. package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
  147. package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +4 -2
  148. package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +13 -76
  149. package/src/llama.cpp/{examples → tools}/rpc/rpc-server.cpp +70 -74
  150. package/src/llama.cpp/{examples → tools}/run/run.cpp +18 -4
  151. package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
  152. package/src/llama.cpp/{examples → tools}/server/server.cpp +291 -76
  153. package/src/llama.cpp/{examples → tools}/server/utils.hpp +377 -5
  154. package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
  155. package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
  156. package/src/llama.cpp/examples/infill/infill.cpp +0 -590
  157. package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
  158. package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
  159. package/src/llama.cpp/examples/llava/clip.h +0 -135
  160. package/src/llama.cpp/examples/llava/llava.cpp +0 -586
  161. package/src/llama.cpp/examples/llava/llava.h +0 -49
  162. package/src/llama.cpp/examples/llava/mtmd.h +0 -168
  163. package/src/llama.cpp/examples/llava/qwen2vl-test.cpp +0 -636
  164. /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
  165. /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
  166. /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
  167. /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
  168. /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
  169. /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
  170. /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
  171. /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
  172. /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
  173. /package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +0 -0
  174. /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
  175. /package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +0 -0
  176. /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
  177. /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
  178. /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
  179. /package/src/llama.cpp/{examples/llava → tools/mtmd}/deprecation-warning.cpp +0 -0
  180. /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
  181. /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
  182. /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
  183. /package/src/llama.cpp/{examples → tools}/rpc/CMakeLists.txt +0 -0
  184. /package/src/llama.cpp/{examples → tools}/run/CMakeLists.txt +0 -0
  185. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
  186. /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
  187. /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
  188. /package/src/llama.cpp/{examples → tools}/server/httplib.h +0 -0
  189. /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
  190. /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
  191. /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
  192. /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
  193. /package/src/llama.cpp/{examples → tools}/tts/tts.cpp +0 -0
package/src/llama.cpp/src/llama-model.h

@@ -36,6 +36,7 @@ enum llm_type {
     LLM_TYPE_335M,
     LLM_TYPE_410M,
     LLM_TYPE_450M,
+    LLM_TYPE_475M,
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
@@ -75,6 +76,7 @@ enum llm_type {
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
@@ -94,6 +96,8 @@ enum llm_type {
     LLM_TYPE_235B_A22B,
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
 struct llama_layer_posnet {
     // resnet
     struct ggml_tensor * norm1 = nullptr;
@@ -394,8 +398,11 @@ struct llama_model {
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory() const; // TODO: params
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
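The freshly declared llama_rope_scaling_type_name suggests a plain enum-to-string mapping. A hedged sketch of the kind of mapping it implies, using the llama_rope_scaling_type values declared in llama.h (the function name and strings below are illustrative, not copied from the implementation):

    #include <string>

    // Illustrative stand-in for llama_rope_scaling_type_name; the real table
    // lives in llama-model.cpp. Enum values follow llama.h.
    static std::string rope_scaling_type_name_sketch(int rope_scaling_type) {
        switch (rope_scaling_type) {
            case 0:  return "none";     // LLAMA_ROPE_SCALING_TYPE_NONE
            case 1:  return "linear";   // LLAMA_ROPE_SCALING_TYPE_LINEAR
            case 2:  return "yarn";     // LLAMA_ROPE_SCALING_TYPE_YARN
            case 3:  return "longrope"; // LLAMA_ROPE_SCALING_TYPE_LONGROPE
            default: return "unknown";
        }
    }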
package/src/llama.cpp/src/llama-quant.cpp

@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
     {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // unless the user specifies a type
         if (params->tensor_types) {
             const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+            const std::string tensor_name(tensor->name);
             for (const auto & [tname, qtype] : tensor_types) {
-                if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                    if (qtype != new_type) {
-                        LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                    if (qtype != new_type) {
+                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                        new_type = qtype;
+                        break; // if two or more types are specified for the tensor, first match wins
                     }
-                    new_type = qtype;
-                    break;
                 }
             }
        }
+
    if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
        new_type = params->token_embedding_type;
    }
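The override loop above now matches each user-supplied regex against the tensor name and stops at the first hit. A self-contained sketch of that first-match-wins behavior (names like override_rule and resolve_type are invented for the example; the real code uses tensor_quantization and ggml_type):

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <vector>

    // Each rule pairs a regex with a target type (plain strings here for brevity).
    struct override_rule {
        std::string pattern; // matched against the tensor name with std::regex_search
        std::string quant;   // e.g. "q8_0"
    };

    static std::string resolve_type(const std::string & tensor_name,
                                    const std::vector<override_rule> & rules,
                                    std::string fallback) {
        for (const auto & [pattern, quant] : rules) {
            if (std::regex_search(tensor_name, std::regex(pattern))) {
                return quant; // first matching rule wins, as in the diff above
            }
        }
        return fallback;
    }

    int main() {
        const std::vector<override_rule> rules = {
            { "attn_v", "q8_0" },  // matches first for attention V tensors
            { "attn",   "q6_K" },  // would also match, but is never reached for attn_v
        };
        // prints "q8_0": the second rule never fires for this tensor
        std::printf("%s\n", resolve_type("blk.0.attn_v.weight", rules, "q4_K").c_str());
    }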
package/src/llama.cpp/src/llama-sampling.cpp

@@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
 
+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
+        return;
+    }
+
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        if (cur_p->data[i].logit > max) {
-            max = cur_p->data[i].logit;
+        // Only count non-negative infinity values
+        if (cur_p->data[i].logit != -INFINITY) {
+            if (cur_p->data[i].logit > max) {
+                max = cur_p->data[i].logit;
+            }
+            logits_sum += cur_p->data[i].logit;
+            valid_count++;
         }
-        logits_sum += cur_p->data[i].logit;
     }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
-        acc += pow(cur_p->data[i].logit - mean, 2);
+        // Skip -infinity in std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
+            acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
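This fix makes the top-n-sigma statistics ignore logits already masked to -INFINITY, which previously poisoned the mean and standard deviation. A standalone sketch of the corrected computation on toy values:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        // Two entries are pre-masked with -INFINITY, as an earlier sampler might do.
        const std::vector<float> logits = { 3.0f, 1.0f, -INFINITY, 2.0f, -INFINITY };
        const float n = 1.0f; // the sampler's sigma multiplier

        float max = -INFINITY, sum = 0.0f;
        size_t valid = 0;
        for (float l : logits) {
            if (l != -INFINITY) { // count only finite logits
                if (l > max) max = l;
                sum += l;
                valid++;
            }
        }
        const float mean = valid ? sum / valid : 0.0f;

        float acc = 0.0f;
        for (float l : logits) {
            if (l != -INFINITY) acc += (l - mean) * (l - mean);
        }
        const float sigma = valid ? std::sqrt(acc / valid) : 0.0f;

        // mean = 2, sigma ~ 0.816 over the three finite logits; with the old code
        // the -INFINITY entries would have produced mean = -inf and sigma = nan.
        std::printf("threshold = %f\n", max - n * sigma); // keep logits above this
    }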
package/src/llama.cpp/src/llama-vocab.cpp

@@ -1,5 +1,7 @@
 #include "llama-vocab.h"
 
+#include "ggml.h"
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
 
@@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                 "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
             };
             break;
+        case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+            regex_exprs = {
+                // original regex from tokenizer.json
+                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+                "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+            };
+            break;
         default:
             // default regex for BPE tokenization pre-processing
             regex_exprs = {
@@ -1227,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
 
+    std::string tokenizer_model;
+    std::string tokenizer_pre;
+
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     // determine vocab type
     {
-        std::string tokenizer_model;
-        std::string tokenizer_pre;
-
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
@@ -1459,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
     if (precompiled_charsmap_keyidx != -1) {
-        size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+        const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+        GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+        const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
         const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
         precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1634,6 +1646,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "bailingmoe") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "seed-coder") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -2778,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
 
+std::string llama_vocab::get_tokenizer_model() const {
+    return pimpl->tokenizer_model;
+}
+
+std::string llama_vocab::get_tokenizer_pre() const {
+    return pimpl->tokenizer_pre;
+}
+
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -3000,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
 
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+
+    return result;
+}
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
+
 int32_t llama_vocab::tokenize(
     const char * text,
     int32_t text_len,
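The new get_bpe_merges() inverts the vocab's internal rank map back into the ordered merge list, each entry serialized as "left right". A toy reproduction of that inversion (the bpe_ranks contents here are made up):

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    int main() {
        // Maps a token pair to its merge rank, like llama_vocab's bpe_ranks.
        std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
            { { "h",  "e" }, 1 },
            { { "t",  "h" }, 0 },
            { { "th", "e" }, 2 },
        };

        // Invert: rank becomes the index, the pair becomes "left right".
        std::vector<std::string> merges(bpe_ranks.size());
        for (const auto & pair : bpe_ranks) {
            merges[pair.second] = pair.first.first + " " + pair.first.second;
        }

        for (const auto & m : merges) {
            std::printf("%s\n", m.c_str()); // "t h", "h e", "th e" in rank order
        }
    }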
package/src/llama.cpp/src/llama-vocab.h

@@ -21,6 +21,9 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
         const char * text,
package/src/llama.cpp/src/llama.cpp

@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"
 
 #include "ggml.h"
@@ -139,6 +140,11 @@ static struct llama_model * llama_model_load_from_file_impl(
         struct llama_model_params params) {
     ggml_time_init();
 
+    if (!params.vocab_only && ggml_backend_reg_count() == 0) {
+        LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__);
+        return nullptr;
+    }
+
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
         params.progress_callback_user_data = &cur_percentage;
@@ -253,6 +259,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
+
 //
 // chat templates
 //
@@ -338,3 +351,4 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
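Together with the new no-backends-loaded guard, the exported llama_model_save_to_file enables a load/save round trip through the new llama-model-saver. A hedged usage sketch (file paths are placeholders; error handling trimmed):

    #include "llama.h"

    int main(void) {
        // Per the new guard, a backend must be loaded before loading a model.
        ggml_backend_load_all();

        struct llama_model_params params = llama_model_default_params();
        struct llama_model * model = llama_model_load_from_file("in.gguf", params); // placeholder path
        if (model == NULL) {
            return 1;
        }

        // Round-trip the model back to disk via the new saver path.
        llama_model_save_to_file(model, "out.gguf"); // placeholder path

        llama_model_free(model);
        return 0;
    }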
package/src/llama.cpp/tests/CMakeLists.txt

@@ -111,10 +111,13 @@ if (NOT WIN32)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
         llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+        target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../tools/server)
+    endif()
+
+    if (NOT GGML_BACKEND_DL)
+        llama_build(test-quantize-stats.cpp)
     endif()
 
-    llama_build(test-quantize-stats.cpp)
     llama_build(test-gbnf-validator.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
@@ -141,6 +144,7 @@ endif()
 
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(test-chat-template.cpp)
+llama_build_and_test(test-regex-partial.cpp)
 
 # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
 if (NOT WIN32)
@@ -162,6 +166,10 @@ if (NOT GGML_BACKEND_DL)
     llama_build_and_test(test-rope.cpp)
 endif()
 
+# libmtmd
+set(LLAMA_TEST_NAME test-mtmd-c-api)
+llama_build_and_test(test-mtmd-c-api.c)
+target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)