@fugood/llama.node 0.3.16 → 0.3.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CMakeLists.txt +3 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +5 -0
  19. package/package.json +1 -1
  20. package/src/LlamaCompletionWorker.cpp +8 -0
  21. package/src/LlamaCompletionWorker.h +1 -0
  22. package/src/LlamaContext.cpp +3 -2
  23. package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
  24. package/src/llama.cpp/.github/workflows/build.yml +70 -27
  25. package/src/llama.cpp/.github/workflows/docker.yml +6 -6
  26. package/src/llama.cpp/.github/workflows/server.yml +7 -11
  27. package/src/llama.cpp/CMakeLists.txt +23 -1
  28. package/src/llama.cpp/common/CMakeLists.txt +6 -3
  29. package/src/llama.cpp/common/arg.cpp +809 -105
  30. package/src/llama.cpp/common/arg.h +9 -0
  31. package/src/llama.cpp/common/chat.cpp +1 -1
  32. package/src/llama.cpp/common/common.cpp +31 -521
  33. package/src/llama.cpp/common/common.h +17 -36
  34. package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
  35. package/src/llama.cpp/common/llguidance.cpp +30 -47
  36. package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
  37. package/src/llama.cpp/common/minja/minja.hpp +119 -93
  38. package/src/llama.cpp/common/sampling.cpp +3 -0
  39. package/src/llama.cpp/docs/build.md +122 -7
  40. package/src/llama.cpp/examples/CMakeLists.txt +0 -9
  41. package/src/llama.cpp/examples/batched/batched.cpp +1 -1
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
  43. package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
  44. package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
  45. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
  46. package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
  47. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
  48. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
  49. package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
  50. package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
  51. package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
  52. package/src/llama.cpp/examples/llava/clip.h +39 -22
  53. package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
  54. package/src/llama.cpp/examples/llava/llava.cpp +64 -52
  55. package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
  56. package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
  57. package/src/llama.cpp/examples/llava/mtmd.h +168 -0
  58. package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
  59. package/src/llama.cpp/examples/main/main.cpp +16 -5
  60. package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
  61. package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
  62. package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
  63. package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
  64. package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
  65. package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
  66. package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
  67. package/src/llama.cpp/examples/run/run.cpp +14 -28
  68. package/src/llama.cpp/examples/server/httplib.h +313 -247
  69. package/src/llama.cpp/examples/server/server.cpp +238 -139
  70. package/src/llama.cpp/examples/server/utils.hpp +51 -2
  71. package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
  72. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
  73. package/src/llama.cpp/examples/sycl/build.sh +2 -2
  74. package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
  75. package/src/llama.cpp/examples/tts/tts.cpp +6 -9
  76. package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
  77. package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
  78. package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
  79. package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
  80. package/src/llama.cpp/ggml/include/ggml.h +66 -99
  81. package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
  82. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
  83. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
  84. package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
  85. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
  86. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
  87. package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
  88. package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
  89. package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
  90. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
  91. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
  92. package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
  93. package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
  94. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
  95. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
  96. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
  97. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
  98. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
  99. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
  100. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
  101. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
  102. package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
  103. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
  105. package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
  106. package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
  107. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
  108. package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
  109. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
  110. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
  111. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
  112. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
  113. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
  114. package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
  115. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
  116. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
  117. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
  118. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
  119. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
  120. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  121. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
  122. package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
  123. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
  124. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
  125. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
  126. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
  127. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
  128. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
  129. package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
  130. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
  131. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
  132. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
  133. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
  134. package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
  135. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
  136. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
  137. package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
  138. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
  139. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
  140. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
  141. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
  142. package/src/llama.cpp/ggml/src/ggml.c +141 -245
  143. package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
  144. package/src/llama.cpp/include/llama.h +30 -11
  145. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
  146. package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
  147. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
  148. package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
  149. package/src/llama.cpp/requirements/requirements-all.txt +2 -0
  150. package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
  151. package/src/llama.cpp/src/CMakeLists.txt +3 -2
  152. package/src/llama.cpp/src/llama-adapter.cpp +37 -1
  153. package/src/llama.cpp/src/llama-arch.cpp +160 -17
  154. package/src/llama.cpp/src/llama-arch.h +16 -0
  155. package/src/llama.cpp/src/llama-chat.cpp +82 -17
  156. package/src/llama.cpp/src/llama-chat.h +6 -2
  157. package/src/llama.cpp/src/llama-context.cpp +108 -92
  158. package/src/llama.cpp/src/llama-context.h +1 -2
  159. package/src/llama.cpp/src/llama-graph.cpp +189 -119
  160. package/src/llama.cpp/src/llama-graph.h +26 -6
  161. package/src/llama.cpp/src/llama-hparams.h +13 -0
  162. package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
  163. package/src/llama.cpp/src/llama-kv-cache.h +41 -115
  164. package/src/llama.cpp/src/llama-memory.h +1 -1
  165. package/src/llama.cpp/src/llama-mmap.cpp +1 -1
  166. package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
  167. package/src/llama.cpp/src/llama-model-loader.h +5 -3
  168. package/src/llama.cpp/src/llama-model.cpp +1760 -534
  169. package/src/llama.cpp/src/llama-model.h +13 -1
  170. package/src/llama.cpp/src/llama-quant.cpp +29 -8
  171. package/src/llama.cpp/src/llama-sampling.cpp +7 -1
  172. package/src/llama.cpp/src/llama-vocab.cpp +44 -6
  173. package/src/llama.cpp/src/llama.cpp +1 -1
  174. package/src/llama.cpp/tests/CMakeLists.txt +43 -30
  175. package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
  176. package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
  177. package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
  178. package/src/llama.cpp/tests/test-chat.cpp +12 -2
  179. package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
  180. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
  181. package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
  182. package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
  183. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
  184. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
  185. package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
  186. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
  187. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
  188. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
  189. package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
  190. package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
  191. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
  192. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
  193. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
  194. package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
  195. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
  196. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
  197. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
  198. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
  199. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
  200. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
  201. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
  202. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295

package/src/llama.cpp/src/llama-model.h

@@ -39,11 +39,14 @@ enum llm_type {
     LLM_TYPE_770M,
     LLM_TYPE_780M,
     LLM_TYPE_0_5B,
+    LLM_TYPE_0_6B,
     LLM_TYPE_1B,
     LLM_TYPE_1_3B,
     LLM_TYPE_1_4B,
     LLM_TYPE_1_5B,
     LLM_TYPE_1_6B,
+    LLM_TYPE_1_7B,
+    LLM_TYPE_1_8B,
     LLM_TYPE_2B,
     LLM_TYPE_2_8B,
     LLM_TYPE_2_9B,
@@ -61,6 +64,7 @@ enum llm_type {
     LLM_TYPE_15B,
     LLM_TYPE_16B,
     LLM_TYPE_20B,
+    LLM_TYPE_27B,
     LLM_TYPE_30B,
     LLM_TYPE_32B,
     LLM_TYPE_34B,
@@ -69,6 +73,7 @@ enum llm_type {
     LLM_TYPE_65B,
     LLM_TYPE_70B,
     LLM_TYPE_236B,
+    LLM_TYPE_290B,
     LLM_TYPE_314B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
@@ -83,7 +88,10 @@ enum llm_type {
     LLM_TYPE_16x3_8B,
     LLM_TYPE_10B_128x3_66B,
     LLM_TYPE_57B_A14B,
-    LLM_TYPE_27B,
+    LLM_TYPE_17B_16E,  // llama4 Scout
+    LLM_TYPE_17B_128E, // llama4 Maverick
+    LLM_TYPE_30B_A3B,
+    LLM_TYPE_235B_A22B,
 };
 
 struct llama_layer_posnet {
@@ -167,6 +175,8 @@ struct llama_layer {
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
+    struct ggml_tensor * wk_b = nullptr;
+    struct ggml_tensor * wv_b = nullptr;
     struct ggml_tensor * wq_cross = nullptr;
     struct ggml_tensor * wk_cross = nullptr;
     struct ggml_tensor * wv_cross = nullptr;
@@ -380,6 +390,8 @@ struct llama_model {
 
     ggml_backend_buffer_type_t select_buft(int il) const;
 
+    bool has_tensor_overrides() const;
+
     const struct ggml_tensor * get_tensor(const char * name) const;
 
     // TODO: move this to new llm_arch_model_i interface

package/src/llama.cpp/src/llama-quant.cpp

@@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>
 
@@ -47,8 +48,14 @@ struct quantize_state_impl {
     {}
 };
 
+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -527,7 +534,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     }
 
     std::vector<std::string> splits = {};
-    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
+    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
     ml.init_mappings(false); // no prefetching
 
     llama_model model(llama_model_default_params());
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
 
-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     new_ofstream(0);
     for (const auto * it : tensors) {
         const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
             new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
-        enum ggml_type new_type;
+        ggml_type new_type;
         void * new_data;
         size_t new_size;
 
@@ -786,6 +793,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // get more optimal quantization type based on the tensor shape, layer, etc.
         if (!params->pure && ggml_is_quantized(default_type)) {
             new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+            // unless the user specifies a type
+            if (params->tensor_types) {
+                const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                for (const auto & [tname, qtype] : tensor_types) {
+                    if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                        if (qtype != new_type) {
+                            LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        }
+                        new_type = qtype;
+                        break;
+                    }
+                }
+            }
         }
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
@@ -910,8 +930,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //
 
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
        /*.nthread =*/ 0,
        /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type =*/ GGML_TYPE_COUNT,
@@ -923,6 +943,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.keep_split =*/ false,
        /*.imatrix =*/ nullptr,
        /*.kv_overrides =*/ nullptr,
+       /*.tensor_type =*/ nullptr,
     };
 
     return result;
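
The tensor_types hook added above is read back as an opaque pointer to a std::vector of tensor_quantization entries, with each name treated as a regex against tensor names; per the comment in the hunk, quantize.cpp keeps a mirrored copy of the struct. Below is a minimal caller-side sketch under those assumptions: the field name tensor_types and the regex semantics come from the hunks above, while the mirrored struct definition, model paths, and chosen types are illustrative only, not part of this diff.

    // Sketch: override the quantization type of selected tensors.
    // Assumes llama_model_quantize_params exposes the void * tensor_types field
    // that the override loop above casts to std::vector<tensor_quantization>.
    #include "llama.h"
    #include <string>
    #include <vector>

    // mirror of the struct defined in llama-quant.cpp (layouts must match)
    struct tensor_quantization {
        std::string name;                    // regex matched against tensor names
        ggml_type   quant = GGML_TYPE_COUNT; // target type for matching tensors
    };

    int main() {
        std::vector<tensor_quantization> overrides = {
            { "attn_v\\.weight", GGML_TYPE_Q8_0 }, // illustrative: keep V projections at higher precision
        };

        llama_model_quantize_params qparams = llama_model_quantize_default_params();
        qparams.ftype        = LLAMA_FTYPE_MOSTLY_Q4_K_M;
        qparams.tensor_types = &overrides; // picked up by the override loop shown above

        // illustrative file names; returns the quantization status code
        return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    }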

package/src/llama.cpp/src/llama-sampling.cpp

@@ -232,7 +232,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
     // }
 
     if (k <= 0) {
-        k = cur_p->size;
+        return;
     }
 
     k = std::min(k, (int) cur_p->size);
@@ -298,6 +298,7 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k)
         }
         cur_p->sorted = true;
     }
+
     cur_p->size = k;
 }
 
@@ -1477,6 +1478,7 @@ static struct llama_sampler * llama_sampler_grammar_clone(const struct llama_sam
     const auto * ctx = (const llama_sampler_grammar *) smpl->ctx;
 
     auto * result = llama_sampler_init_grammar_impl(ctx->vocab, nullptr, nullptr, false, nullptr, 0, nullptr, 0, nullptr, 0);
+    GGML_ASSERT(result);
 
     // copy the state
     {
@@ -1548,6 +1550,10 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
            /* .grammar_root = */ grammar_root,
            /* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
        };
+       if (!ctx->grammar) {
+           delete ctx;
+           return nullptr;
+       }
    } else {
        *ctx = {
            /* .vocab = */ vocab,
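
A side effect of the grammar hunks above: llama_sampler_init_grammar_impl now deletes its context and returns nullptr when the grammar fails to initialize, so code that builds a grammar sampler through the public API should be prepared for a null result. A small sketch of that caller-side check follows; the GBNF string and the fallback behavior are illustrative, not part of this diff.

    // Sketch: a grammar sampler constructor may now yield nullptr on a bad grammar.
    #include "llama.h"
    #include <cstdio>

    llama_sampler * make_yes_no_sampler(const llama_vocab * vocab) {
        const char * grammar = "root ::= \"yes\" | \"no\""; // illustrative GBNF
        llama_sampler * smpl = llama_sampler_init_grammar(vocab, grammar, "root");
        if (smpl == nullptr) {
            fprintf(stderr, "grammar failed to parse, sampling will be unconstrained\n");
        }
        return smpl; // caller must handle the nullptr case
    }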

package/src/llama.cpp/src/llama-vocab.cpp

@@ -342,6 +342,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
+           case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
@@ -400,6 +401,20 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
+           case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+               regex_exprs = {
+                   "\\p{N}+",
+                   "(?=(\\d{3})+(?!\\d))",
+               };
+               break;
+           case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+               regex_exprs = {
+                   // original regex from tokenizer.json
+                   // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                   // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
+                   "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+               };
+               break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@@ -1491,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "llama3" ||
                tokenizer_pre == "llama-v3" ||
                tokenizer_pre == "llama-bpe"||
-               tokenizer_pre == "falcon3") {
+               tokenizer_pre == "falcon3" ||
+               tokenizer_pre == "pixtral") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
            ignore_merges = true;
            add_bos = true;
@@ -1557,6 +1573,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
            clean_spaces = false;
        } else if (
+               tokenizer_pre == "glm4" ||
               tokenizer_pre == "chatglm-bpe") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
            special_bos_id = LLAMA_TOKEN_NULL;
@@ -1601,9 +1618,22 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
               tokenizer_pre == "megrez") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
        } else if (
-               tokenizer_pre == "gpt-4o") {
+               tokenizer_pre == "gpt-4o" ||
+               tokenizer_pre == "llama4") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
            clean_spaces = false;
+       } else if (
+               tokenizer_pre == "superbpe") {
+           pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+           clean_spaces = false;
+       } else if (
+               tokenizer_pre == "trillion") {
+           pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
+           clean_spaces = false;
+       } else if (
+               tokenizer_pre == "bailingmoe") {
+           pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+           clean_spaces = false;
        } else {
            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
        }
@@ -1781,6 +1811,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                || t.first == "<end_of_turn>"
                || t.first == "<|endoftext|>"
                || t.first == "<EOT>"
+               || t.first == "_<EOT>"
                || t.first == "<|end▁of▁sentence|>" // DeepSeek
               ) {
                special_eot_id = t.second;
@@ -1811,8 +1842,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (false
                || t.first == "<|fim_prefix|>" // Qwen
                || t.first == "<fim-prefix>"
+               || t.first == "<fim_prefix>" // Granite
                || t.first == "<|fim▁begin|>" // DeepSeek
                || t.first == "<PRE>"
+               || t.first == "▁<PRE>" // CodeLlama
               ) {
                special_fim_pre_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1828,8 +1861,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (false
                || t.first == "<|fim_suffix|>" // Qwen
                || t.first == "<fim-suffix>"
+               || t.first == "<fim_suffix>" // Granite
                || t.first == "<|fim▁hole|>" // DeepSeek
                || t.first == "<SUF>"
+               || t.first == "▁<SUF>" // CodeLlama
               ) {
                special_fim_suf_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1845,8 +1880,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (false
                || t.first == "<|fim_middle|>" // Qwen
                || t.first == "<fim-middle>"
+               || t.first == "<fim_middle>" // Granite
                || t.first == "<|fim▁end|>" // DeepSeek
                || t.first == "<MID>"
+               || t.first == "▁<MID>" // CodeLlama
               ) {
                special_fim_mid_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1862,6 +1899,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
            if (false
                || t.first == "<|fim_pad|>" // Qwen
                || t.first == "<fim-pad>"
+               || t.first == "<fim_pad>" // Granite
                || t.first == "<PAD>"
               ) {
                special_fim_pad_id = t.second;
@@ -1880,6 +1918,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                || t.first == "<|repo_name|>"
                || t.first == "<fim-repo>"
                || t.first == "<REPO>"
+               || t.first == "<reponame>" // Granite
               ) {
                special_fim_rep_id = t.second;
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1931,6 +1970,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                || t.first == "<|endoftext|>"
                || t.first == "<|eom_id|>"
                || t.first == "<EOT>"
+               || t.first == "_<EOT>"
               ) {
                special_eog_ids.insert(t.second);
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2189,14 +2229,12 @@ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer
            // find the first occurrence of a given special token in this fragment
            // passing offset argument only limit the "search area" but match coordinates
            // are still relative to the source full raw_text
-           auto match = raw_text.find(text, raw_text_base_offset);
+           // string_view begins at pos 0 for the same reason
+           auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
 
            // no occurrences found, stop processing this fragment for a given special token
            if (match == std::string::npos) break;
 
-           // check if match is within bounds of offset <-> length
-           if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
-
 #ifdef PRETOKENIZERDEBUG
            LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
 #endif

package/src/llama.cpp/src/llama.cpp

@@ -92,7 +92,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 

package/src/llama.cpp/tests/CMakeLists.txt

@@ -1,5 +1,17 @@
 llama_add_compile_flags()
 
+function(llama_build source)
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_TARGET ${LLAMA_TEST_NAME})
+    else()
+        get_filename_component(TEST_TARGET ${source} NAME_WE)
+    endif()
+
+    add_executable(${TEST_TARGET} ${source})
+    target_link_libraries(${TEST_TARGET} PRIVATE common)
+    install(TARGETS ${TEST_TARGET} RUNTIME)
+endfunction()
+
 function(llama_test target)
     include(CMakeParseArguments)
     set(options)
@@ -36,7 +48,7 @@ endfunction()
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_target_and_test source)
+function(llama_build_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -58,6 +70,7 @@ function(llama_target_and_test source)
     add_executable(${TEST_TARGET} ${source} get-model.cpp)
     install(TARGETS ${TEST_TARGET} RUNTIME)
     target_link_libraries(${TEST_TARGET} PRIVATE common)
+
     add_test(
         NAME ${TEST_TARGET}
         WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
@@ -68,9 +81,7 @@ function(llama_target_and_test source)
 endfunction()
 
 # build test-tokenizer-0 target once and add many tests
-add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
-target_link_libraries(test-tokenizer-0 PRIVATE common)
-install(TARGETS test-tokenizer-0 RUNTIME)
+llama_build(test-tokenizer-0.cpp)
 
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
@@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
 if (LLAMA_LLGUIDANCE)
-    llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+    llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
 endif ()
 
 if (NOT WIN32)
     # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API
-    llama_target_and_test(test-sampling.cpp)
-    llama_target_and_test(test-grammar-parser.cpp)
-    llama_target_and_test(test-grammar-integration.cpp)
-    llama_target_and_test(test-llama-grammar.cpp)
-    llama_target_and_test(test-chat.cpp)
+    llama_build_and_test(test-sampling.cpp)
+    llama_build_and_test(test-grammar-parser.cpp)
+    llama_build_and_test(test-grammar-integration.cpp)
+    llama_build_and_test(test-llama-grammar.cpp)
+    llama_build_and_test(test-chat.cpp)
     # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
     if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-        llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+        llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
         target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
     endif()
 
+    llama_build(test-quantize-stats.cpp)
+    llama_build(test-gbnf-validator.cpp)
 
     # build test-tokenizer-1-bpe target once and add many tests
-    add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-    target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-    install(TARGETS test-tokenizer-1-bpe RUNTIME)
+    llama_build(test-tokenizer-1-bpe.cpp)
 
     # TODO: disabled due to slowness
     #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
@@ -120,33 +131,35 @@ if (NOT WIN32)
     #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 
     # build test-tokenizer-1-spm target once and add many tests
-    add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-    target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-    install(TARGETS test-tokenizer-1-spm RUNTIME)
+    llama_build(test-tokenizer-1-spm.cpp)
 
     llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
     #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
 
-    # llama_target_and_test(test-double-float.cpp) # SLOW
+    # llama_build_and_test(test-double-float.cpp) # SLOW
 endif()
 
-llama_target_and_test(test-log.cpp)
-llama_target_and_test(test-arg-parser.cpp)
-llama_target_and_test(test-chat-template.cpp)
+llama_build_and_test(test-log.cpp)
+llama_build_and_test(test-chat-template.cpp)
+
+# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
+if (NOT WIN32)
+    llama_build_and_test(test-arg-parser.cpp)
+endif()
 
-# llama_target_and_test(test-opt.cpp) # SLOW
-llama_target_and_test(test-gguf.cpp)
-llama_target_and_test(test-backend-ops.cpp)
+# llama_build_and_test(test-opt.cpp) # SLOW
+llama_build_and_test(test-gguf.cpp)
+llama_build_and_test(test-backend-ops.cpp)
 
-llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
-llama_target_and_test(test-autorelease.cpp LABEL "model")
+llama_build_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_build_and_test(test-autorelease.cpp LABEL "model")
 
 if (NOT GGML_BACKEND_DL)
     # these tests use the backends directly and cannot be built with dynamic loading
-    llama_target_and_test(test-barrier.cpp)
-    llama_target_and_test(test-quantize-fns.cpp)
-    llama_target_and_test(test-quantize-perf.cpp)
-    llama_target_and_test(test-rope.cpp)
+    llama_build_and_test(test-barrier.cpp)
+    llama_build_and_test(test-quantize-fns.cpp)
+    llama_build_and_test(test-quantize-perf.cpp)
+    llama_build_and_test(test-rope.cpp)
 endif()
 

package/src/llama.cpp/tests/test-arg-parser.cpp

@@ -77,7 +77,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "model_file.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "model_file.gguf");
+    assert(params.model.path == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
@@ -89,7 +89,7 @@ int main(void) {
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "abc.gguf");
+    assert(params.model.path == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
@@ -112,7 +112,7 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "blah.gguf");
+    assert(params.model.path == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 
 
@@ -122,10 +122,57 @@ int main(void) {
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
     assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
-    assert(params.model == "overwritten.gguf");
+    assert(params.model.path == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
 
+    if (common_has_curl()) {
+        printf("test-arg-parser: test curl-related functions\n\n");
+        const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md";
+        const char * BAD_URL = "https://www.google.com/404";
+        const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
+
+        {
+            printf("test-arg-parser: test good URL\n\n");
+            auto res = common_remote_get_content(GOOD_URL, {});
+            assert(res.first == 200);
+            assert(res.second.size() > 0);
+            std::string str(res.second.data(), res.second.size());
+            assert(str.find("llama.cpp") != std::string::npos);
+        }
+
+        {
+            printf("test-arg-parser: test bad URL\n\n");
+            auto res = common_remote_get_content(BAD_URL, {});
+            assert(res.first == 404);
+        }
+
+        {
+            printf("test-arg-parser: test max size error\n");
+            common_remote_params params;
+            params.max_size = 1;
+            try {
+                common_remote_get_content(GOOD_URL, params);
+                assert(false && "it should throw an error");
+            } catch (std::exception & e) {
+                printf("  expected error: %s\n\n", e.what());
+            }
+        }
+
+        {
+            printf("test-arg-parser: test timeout error\n");
+            common_remote_params params;
+            params.timeout = 1;
+            try {
+                common_remote_get_content(BIG_FILE, params);
+                assert(false && "it should throw an error");
+            } catch (std::exception & e) {
+                printf("  expected error: %s\n\n", e.what());
+            }
+        }
+    } else {
+        printf("test-arg-parser: no curl, skipping curl-related functions\n");
+    }
 
     printf("test-arg-parser: all tests OK\n\n");
 }