@fugood/llama.node 1.4.7 → 1.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/lib/binding.ts +8 -0
  2. package/package.json +15 -15
  3. package/scripts/llama.cpp.patch +23 -24
  4. package/src/LlamaContext.cpp +4 -2
  5. package/src/llama.cpp/common/CMakeLists.txt +2 -0
  6. package/src/llama.cpp/common/arg.cpp +470 -223
  7. package/src/llama.cpp/common/arg.h +43 -2
  8. package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
  9. package/src/llama.cpp/common/chat.cpp +140 -0
  10. package/src/llama.cpp/common/common.cpp +130 -67
  11. package/src/llama.cpp/common/common.h +44 -17
  12. package/src/llama.cpp/common/console.cpp +98 -18
  13. package/src/llama.cpp/common/console.h +30 -8
  14. package/src/llama.cpp/common/download.cpp +69 -25
  15. package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
  16. package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
  17. package/src/llama.cpp/common/log.cpp +5 -0
  18. package/src/llama.cpp/common/log.h +1 -0
  19. package/src/llama.cpp/common/peg-parser.cpp +1 -1
  20. package/src/llama.cpp/common/preset.cpp +206 -0
  21. package/src/llama.cpp/common/preset.h +32 -0
  22. package/src/llama.cpp/common/sampling.cpp +67 -54
  23. package/src/llama.cpp/common/sampling.h +8 -0
  24. package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
  25. package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
  26. package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
  27. package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
  28. package/src/llama.cpp/ggml/include/ggml.h +7 -8
  29. package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
  30. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +4 -0
  31. package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +285 -0
  32. package/src/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +28 -0
  33. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +111 -45
  34. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
  35. package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +288 -1
  36. package/src/llama.cpp/ggml/src/ggml-cpu/repack.h +8 -0
  37. package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +41 -1
  38. package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +125 -22
  39. package/src/llama.cpp/include/llama.h +18 -1
  40. package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
  41. package/src/llama.cpp/src/llama-arch.h +9 -2
  42. package/src/llama.cpp/src/llama-batch.cpp +12 -2
  43. package/src/llama.cpp/src/llama-batch.h +4 -2
  44. package/src/llama.cpp/src/llama-context.cpp +93 -23
  45. package/src/llama.cpp/src/llama-context.h +8 -2
  46. package/src/llama.cpp/src/llama-graph.cpp +84 -16
  47. package/src/llama.cpp/src/llama-graph.h +17 -4
  48. package/src/llama.cpp/src/llama-hparams.cpp +6 -0
  49. package/src/llama.cpp/src/llama-hparams.h +5 -1
  50. package/src/llama.cpp/src/llama-impl.cpp +4 -0
  51. package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
  52. package/src/llama.cpp/src/llama-kv-cache.h +19 -2
  53. package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
  54. package/src/llama.cpp/src/llama-mmap.cpp +123 -28
  55. package/src/llama.cpp/src/llama-mmap.h +5 -1
  56. package/src/llama.cpp/src/llama-model-loader.cpp +58 -13
  57. package/src/llama.cpp/src/llama-model-loader.h +2 -0
  58. package/src/llama.cpp/src/llama-model.cpp +110 -49
  59. package/src/llama.cpp/src/llama-model.h +1 -0
  60. package/src/llama.cpp/src/llama-quant.cpp +1 -1
  61. package/src/llama.cpp/src/llama-sampling.cpp +16 -0
  62. package/src/llama.cpp/src/llama-vocab.cpp +2 -1
  63. package/src/llama.cpp/src/llama.cpp +665 -1
  64. package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
  65. package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
  66. package/src/llama.cpp/src/models/glm4.cpp +27 -4
  67. package/src/llama.cpp/src/models/models.h +5 -5
  68. package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
  69. package/src/llama.cpp/src/models/qwen2.cpp +12 -3
  70. package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
package/src/llama.cpp/common/common.h

@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
 enum llama_example {
     LLAMA_EXAMPLE_COMMON,
     LLAMA_EXAMPLE_SPECULATIVE,
-    LLAMA_EXAMPLE_MAIN,
+    LLAMA_EXAMPLE_COMPLETION,
+    LLAMA_EXAMPLE_CLI,
     LLAMA_EXAMPLE_EMBEDDING,
     LLAMA_EXAMPLE_PERPLEXITY,
     LLAMA_EXAMPLE_RETRIEVAL,
@@ -98,6 +99,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,

     LLAMA_EXAMPLE_COUNT,
 };
@@ -194,7 +196,6 @@ struct common_params_sampling {

     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY

-
     std::vector<enum common_sampler_type> samplers = {
         COMMON_SAMPLER_TYPE_PENALTIES,
         COMMON_SAMPLER_TYPE_DRY,
@@ -215,6 +216,10 @@ struct common_params_sampling {
     std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
     std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
     // print the parameters into a string
     std::string print() const;
 };
@@ -303,8 +308,8 @@ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);

 struct common_params {
     bool vocab_only = false;
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 4096; // context size
+    int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
+    int32_t n_ctx = 0; // context size, 0 == context the model was trained with
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -325,9 +330,12 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_gpu_layers = -1;     // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0;          // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    bool fit_params = true;        // whether to fit unset model/context parameters to free device memory
+    size_t fit_params_target = 1024 * 1024 * 1024; // margin per device in bytes for fitting parameters to free memory
+    int32_t fit_params_min_ctx = 4096;             // minimum context size to set when trying to reduce memory use

     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

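Note: the new fit_params fields bound the automatic memory fitting done at load time. A minimal configuration sketch (hypothetical values; field names and semantics taken from the struct above):

    common_params params;
    params.fit_params         = true;               // fit unset model/context params to free device memory
    params.fit_params_target  = 512u * 1024 * 1024; // hypothetical: leave ~512 MiB of headroom per device
    params.fit_params_min_ctx = 8192;               // hypothetical: never shrink n_ctx below 8192 when reducing memory use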
@@ -407,6 +415,7 @@ struct common_params {
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
     bool no_perf = false; // disable performance metrics
+    bool show_timings = true; // show timing information on CLI
     bool ctx_shift = false; // context shift on infinite text generation
     bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
@@ -463,7 +472,7 @@ struct common_params {
     std::string public_path = ""; // NOLINT
     std::string api_prefix = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    bool use_jinja = false; // NOLINT
+    bool use_jinja = true; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int reasoning_budget = -1;
@@ -476,16 +485,20 @@ struct common_params {

     std::map<std::string, std::string> default_template_kwargs;

+    // webui configs
+    bool webui = true;
+    std::string webui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
-    bool webui = true;
     bool endpoint_slots = true;
     bool endpoint_props = false; // only control POST requests, not GET
     bool endpoint_metrics = false;

     // router server configs
-    std::string models_dir = ""; // directory containing models for the router server
-    int models_max = 4; // maximum number of models to load simultaneously
-    bool models_autoload = true; // automatically load models when requested via the router server
+    std::string models_dir = "";    // directory containing models for the router server
+    std::string models_preset = ""; // directory containing model presets for the router server
+    int models_max = 4;             // maximum number of models to load simultaneously
+    bool models_autoload = true;    // automatically load models when requested via the router server

     bool log_json = false;

@@ -667,15 +680,29 @@ bool tty_can_use_colors();
 // Model utils
 //

-// note: defines object's lifetime
+struct common_sampler;
+
+// note: defines the model, context, samplers, etc. lifetimes
 struct common_init_result {
-    llama_model_ptr model;
-    llama_context_ptr context;
+    common_init_result(common_params & params);
+    ~common_init_result();

-    std::vector<llama_adapter_lora_ptr> lora;
+    llama_model * model();
+    llama_context * context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };

-struct common_init_result common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params   common_model_params_to_llama  (      common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
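Note: common_init_result is now an opaque pimpl type returned through a smart pointer, so model, context, and per-sequence sampler lifetimes are tied to one object. A minimal caller sketch using only the declarations above, assuming the usual common_params_parse helper from common/arg.h (error handling elided):

    #include "arg.h"    // assumed: declares common_params_parse
    #include "common.h"

    int main(int argc, char ** argv) {
        common_params params;
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
            return 1;
        }

        common_init_result_ptr init = common_init_from_params(params);
        if (!init || !init->model() || !init->context()) {
            return 1;
        }

        llama_context * ctx   = init->context();  // borrowed pointer, owned by init
        common_sampler * smpl = init->sampler(0); // sampler for sequence 0
        // ... decode with ctx, sample with smpl ...

        init->free_context(); // optional: release the context before the model
        return 0;             // model, samplers, LoRA adapters freed with init
    }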
package/src/llama.cpp/common/console.cpp

@@ -1,4 +1,5 @@
 #include "console.h"
+#include "log.h"
 #include <vector>
 #include <iostream>
 #include <cassert>
@@ -6,6 +7,10 @@
 #include <cctype>
 #include <cwctype>
 #include <cstdint>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+#include <stdarg.h>

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@@ -35,6 +40,7 @@
 #define ANSI_COLOR_BLUE "\x1b[34m"
 #define ANSI_COLOR_MAGENTA "\x1b[35m"
 #define ANSI_COLOR_CYAN "\x1b[36m"
+#define ANSI_COLOR_GRAY "\x1b[90m"
 #define ANSI_COLOR_RESET "\x1b[0m"
 #define ANSI_BOLD "\x1b[1m"

@@ -61,17 +67,17 @@ namespace console {
     //
 #endif

-    static bool advanced_display = false;
-    static bool simple_io = true;
-    static display_t current_display = reset;
+    static bool         advanced_display = false;
+    static bool         simple_io        = true;
+    static display_type current_display  = DISPLAY_TYPE_RESET;

-    static FILE* out = stdout;
+    static FILE * out = stdout;

 #if defined (_WIN32)
-    static void* hConsole;
+    static void * hConsole;
 #else
-    static FILE* tty = nullptr;
-    static termios initial_state;
+    static FILE *  tty = nullptr;
+    static termios initial_state;
 #endif

     //
@@ -142,7 +148,7 @@ namespace console {

     void cleanup() {
         // Reset console display
-        set_display(reset);
+        set_display(DISPLAY_TYPE_RESET);

 #if !defined(_WIN32)
         // Restore settings on POSIX systems
@@ -162,20 +168,26 @@ namespace console {
     //

     // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
+    void set_display(display_type display) {
         if (advanced_display && current_display != display) {
-            fflush(stdout);
+            common_log_flush(common_log_main());
             switch(display) {
-                case reset:
+                case DISPLAY_TYPE_RESET:
                     fprintf(out, ANSI_COLOR_RESET);
                     break;
-                case prompt:
+                case DISPLAY_TYPE_INFO:
+                    fprintf(out, ANSI_COLOR_MAGENTA);
+                    break;
+                case DISPLAY_TYPE_PROMPT:
                     fprintf(out, ANSI_COLOR_YELLOW);
                     break;
-                case user_input:
+                case DISPLAY_TYPE_REASONING:
+                    fprintf(out, ANSI_COLOR_GRAY);
+                    break;
+                case DISPLAY_TYPE_USER_INPUT:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
                     break;
-                case error:
+                case DISPLAY_TYPE_ERROR:
                     fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
             }
             current_display = display;
@@ -778,7 +790,6 @@
         }

         if (is_special_char) {
-            set_display(user_input);
             replace_last(line.back());
             is_special_char = false;
         }
@@ -961,7 +972,6 @@
         }

         if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-            set_display(prompt);
             replace_last(line.back());
             is_special_char = true;
         }
@@ -1046,12 +1056,82 @@ namespace console {
     }

     bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
         if (simple_io) {
             return readline_simple(line, multiline_input);
         }
         return readline_advanced(line, multiline_input);
     }

+    namespace spinner {
+        static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
+        static std::condition_variable cv_stop;
+        static std::thread th;
+        static size_t frame = 0; // only modified by one thread
+        static bool running = false;
+        static std::mutex mtx;
+        static auto wait_time = std::chrono::milliseconds(100);
+        static void draw_next_frame() {
+            // don't need lock because only one thread modifies running
+            frame = (frame + 1) % sizeof(LOADING_CHARS);
+            replace_last(LOADING_CHARS[frame]);
+            fflush(out);
+        }
+        void start() {
+            std::unique_lock<std::mutex> lock(mtx);
+            if (simple_io || running) {
+                return;
+            }
+            common_log_flush(common_log_main());
+            fprintf(out, "%c", LOADING_CHARS[0]);
+            fflush(out);
+            frame = 1;
+            running = true;
+            th = std::thread([]() {
+                std::unique_lock<std::mutex> lock(mtx);
+                while (true) {
+                    if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
+                        break;
+                    }
+                    draw_next_frame();
+                }
+            });
+        }
+        void stop() {
+            {
+                std::unique_lock<std::mutex> lock(mtx);
+                if (simple_io || !running) {
+                    return;
+                }
+                running = false;
+                cv_stop.notify_all();
+            }
+            if (th.joinable()) {
+                th.join();
+            }
+            replace_last(' ');
+            pop_cursor();
+            fflush(out);
+        }
+    }
+
+    void log(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        vfprintf(out, fmt, args);
+        va_end(args);
+    }
+
+    void error(const char * fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        display_type cur = current_display;
+        set_display(DISPLAY_TYPE_ERROR);
+        vfprintf(out, fmt, args);
+        set_display(cur); // restore previous color
+        va_end(args);
+    }
+
+    void flush() {
+        fflush(out);
+    }
 }
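Note: a minimal usage sketch for the new spinner and console logging API, assuming console::init() has already been called and that this runs on a dedicated CLI thread (per the note in the header below):

    #include "console.h"

    void load_with_feedback() {
        console::log("loading model ");
        console::spinner::start(); // animates | / - \ on a background thread
        // ... long-running work on the current thread ...
        console::spinner::stop();  // joins the spinner thread and erases the character
        console::log("done\n");
        console::flush();
    }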
package/src/llama.cpp/common/console.h

@@ -2,18 +2,40 @@

 #pragma once

+#include "common.h"
+
 #include <string>

-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
+enum display_type {
+    DISPLAY_TYPE_RESET = 0,
+    DISPLAY_TYPE_INFO,
+    DISPLAY_TYPE_PROMPT,
+    DISPLAY_TYPE_REASONING,
+    DISPLAY_TYPE_USER_INPUT,
+    DISPLAY_TYPE_ERROR
+};

+namespace console {
     void init(bool use_simple_io, bool use_advanced_display);
     void cleanup();
-    void set_display(display_t display);
+    void set_display(display_type display);
     bool readline(std::string & line, bool multiline_input);
+
+    namespace spinner {
+        void start();
+        void stop();
+    }
+
+    // note: the logging API below outputs directly to stdout
+    // it can negatively impact performance if used on the inference thread
+    // only use it in a dedicated CLI thread
+    // for logging on the inference thread, use log.h instead
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void log(const char * fmt, ...);
+
+    LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+    void error(const char * fmt, ...);
+
+    void flush();
 }
package/src/llama.cpp/common/download.cpp

@@ -12,6 +12,8 @@
 #include <filesystem>
 #include <fstream>
 #include <future>
+#include <map>
+#include <mutex>
 #include <regex>
 #include <string>
 #include <thread>
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &

 #elif defined(LLAMA_USE_HTTPLIB)

-static bool is_output_a_tty() {
+class ProgressBar {
+    static inline std::mutex mutex;
+    static inline std::map<const ProgressBar *, int> lines;
+    static inline int max_line = 0;
+
+    static void cleanup(const ProgressBar * line) {
+        lines.erase(line);
+        if (lines.empty()) {
+            max_line = 0;
+        }
+    }
+
+    static bool is_output_a_tty() {
 #if defined(_WIN32)
-    return _isatty(_fileno(stdout));
+        return _isatty(_fileno(stdout));
 #else
-    return isatty(1);
+        return isatty(1);
 #endif
-}
+    }

-static void print_progress(size_t current, size_t total) {
-    if (!is_output_a_tty()) {
-        return;
+  public:
+    ProgressBar() = default;
+
+    ~ProgressBar() {
+        std::lock_guard<std::mutex> lock(mutex);
+        cleanup(this);
     }

-    if (!total) {
-        return;
+    void update(size_t current, size_t total) {
+        if (!is_output_a_tty()) {
+            return;
+        }
+
+        if (!total) {
+            return;
+        }
+
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (lines.find(this) == lines.end()) {
+            lines[this] = max_line++;
+            std::cout << "\n";
+        }
+        int lines_up = max_line - lines[this];
+
+        size_t width = 50;
+        size_t pct = (100 * current) / total;
+        size_t pos = (width * current) / total;
+
+        std::cout << "\033[s";
+
+        if (lines_up > 0) {
+            std::cout << "\033[" << lines_up << "A";
+        }
+        std::cout << "\033[2K\r["
+                  << std::string(pos, '=')
+                  << (pos < width ? ">" : "")
+                  << std::string(width - pos, ' ')
+                  << "] " << std::setw(3) << pct << "% ("
+                  << current / (1024 * 1024) << " MB / "
+                  << total / (1024 * 1024) << " MB) "
+                  << "\033[u";
+
+        std::cout.flush();
+
+        if (current == total) {
+            cleanup(this);
+        }
     }

-    size_t width = 50;
-    size_t pct = (100 * current) / total;
-    size_t pos = (width * current) / total;
-
-    std::cout << "["
-              << std::string(pos, '=')
-              << (pos < width ? ">" : "")
-              << std::string(width - pos, ' ')
-              << "] " << std::setw(3) << pct << "% ("
-              << current / (1024 * 1024) << " MB / "
-              << total / (1024 * 1024) << " MB)\r";
-    std::cout.flush();
-}
+    ProgressBar(const ProgressBar &) = delete;
+    ProgressBar & operator=(const ProgressBar &) = delete;
+};

 static bool common_pull_file(httplib::Client & cli,
                              const std::string & resolve_path,
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
     const char * func = __func__; // avoid __func__ inside a lambda
     size_t downloaded = existing_size;
     size_t progress_step = 0;
+    ProgressBar bar;

     auto res = cli.Get(resolve_path, headers,
         [&](const httplib::Response &response) {
@@ -554,7 +600,7 @@
             progress_step += len;

             if (progress_step >= total_size / 1000 || downloaded == total_size) {
-                print_progress(downloaded, total_size);
+                bar.update(downloaded, total_size);
                 progress_step = 0;
             }
             return true;
@@ -562,8 +608,6 @@
         nullptr
     );

-    std::cout << "\n";
-
    if (!res) {
        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
        return false;
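Note: the multi-line behavior of ProgressBar rests on standard ANSI control sequences rather than any httplib feature: save the cursor (ESC[s), move up N lines (ESC[NA), clear the line (ESC[2K), redraw, then restore the cursor (ESC[u). A self-contained sketch of just that technique:

    #include <cstdio>

    // redraw the bar that lives `lines_up` rows above the cursor, then put the cursor back
    static void redraw_bar_line(int lines_up, const char * text) {
        std::printf("\033[s");                 // save cursor position
        if (lines_up > 0) {
            std::printf("\033[%dA", lines_up); // move up to this bar's row
        }
        std::printf("\033[2K\r%s", text);      // clear the row and redraw it
        std::printf("\033[u");                 // restore cursor position
        std::fflush(stdout);
    }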
package/src/llama.cpp/common/json-schema-to-grammar.cpp

@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {

 std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }

-class SchemaConverter {
+class common_schema_converter {
 private:
+    friend class common_schema_info;
     friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
     std::function<json(const std::string &)> _fetch_json;
     bool _dotall;
@@ -729,7 +730,7 @@
     }

 public:
-    SchemaConverter(
+    common_schema_converter(
         const std::function<json(const std::string &)> & fetch_json,
         bool dotall)
         : _fetch_json(fetch_json), _dotall(dotall)
@@ -990,6 +991,134 @@ public:
     }
 };

+// common_schema_info implementation (pimpl)
+
+common_schema_info::common_schema_info()
+    : impl_(std::make_unique<common_schema_converter>(
+          [](const std::string &) { return json(); },
+          false)) {}
+
+common_schema_info::~common_schema_info() = default;
+
+common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
+common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
+
+void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
+    impl_->resolve_refs(schema, "");
+}
+
+// Determines if a JSON schema can resolve to a string type through any path.
+// Some models emit raw string values rather than JSON-encoded strings for string parameters.
+// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
+// true, allowing callers to handle the value as a raw string for simplicity.
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
+    std::unordered_set<std::string> visited_refs;
+
+    std::function<bool(const json &)> check = [&](const json & s) -> bool {
+        if (!s.is_object()) {
+            return false;
+        }
+
+        // Handle $ref
+        if (s.contains("$ref")) {
+            const std::string & ref = s["$ref"];
+            if (visited_refs.find(ref) != visited_refs.end()) {
+                // Circular reference, assume not a string to be safe
+                return false;
+            }
+            visited_refs.insert(ref);
+            auto it = impl_->_refs.find(ref);
+            if (it != impl_->_refs.end()) {
+                return check(it->second);
+            }
+            return false;
+        }
+
+        // Check type field
+        if (s.contains("type")) {
+            const json & schema_type = s["type"];
+            if (schema_type.is_string()) {
+                if (schema_type == "string") {
+                    return true;
+                }
+            } else if (schema_type.is_array()) {
+                // Type can be an array like ["string", "null"]
+                for (const auto & t : schema_type) {
+                    if (t == "string") {
+                        return true;
+                    }
+                }
+            }
+        }
+
+        // Check oneOf/anyOf - if any alternative can be a string
+        if (s.contains("oneOf")) {
+            for (const auto & alt : s["oneOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+        if (s.contains("anyOf")) {
+            for (const auto & alt : s["anyOf"]) {
+                if (check(alt)) {
+                    return true;
+                }
+            }
+        }
+
+        // Check allOf - all components must be compatible with string type
+        if (s.contains("allOf")) {
+            bool all_string = true;
+            for (const auto & component : s["allOf"]) {
+                if (!check(component)) {
+                    all_string = false;
+                    break;
+                }
+            }
+            if (all_string) {
+                return true;
+            }
+        }
+
+        // Check const - if the constant value is a string
+        if (s.contains("const")) {
+            if (s["const"].is_string()) {
+                return true;
+            }
+        }
+
+        // Check enum - if any enum value is a string
+        if (s.contains("enum")) {
+            for (const auto & val : s["enum"]) {
+                if (val.is_string()) {
+                    return true;
+                }
+            }
+        }
+
+        // String-specific keywords imply string type
+        if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
+            return true;
+        }
+
+        // Check format - many formats imply string
+        if (s.contains("format")) {
+            const std::string & fmt = s["format"];
+            if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
+                fmt == "uri" || fmt == "email" || fmt == "hostname" ||
+                fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
+                fmt.find("uuid") == 0) {
+                return true;
+            }
+        }
+
+        return false;
+    };
+
+    return check(schema);
+}
+
 std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 #ifdef LLAMA_USE_LLGUIDANCE
     if (!force_gbnf) {
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
 }

 std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
-    SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall);
+    common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
     common_grammar_builder builder {
         /* .add_rule = */ [&](const std::string & name, const std::string & rule) {
             return converter._add_rule(name, rule);
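Note: a hypothetical illustration of the new common_schema_info helper (declared in json-schema-to-grammar.h per the listing above): deciding whether a tool-call argument may be accepted as a raw string instead of a JSON-encoded one:

    #include "json-schema-to-grammar.h"

    // returns true if the schema admits a plain string anywhere, so the caller
    // may treat the incoming value as a raw (non-JSON-encoded) string
    static bool may_be_raw_string(nlohmann::ordered_json schema) {
        common_schema_info info;
        info.resolve_refs(schema);              // resolve $ref targets first
        return info.resolves_to_string(schema); // any branch admitting a string counts
    }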