@fugood/llama.node 1.4.7 → 1.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/binding.ts +8 -0
- package/package.json +15 -15
- package/scripts/llama.cpp.patch +22 -23
- package/src/LlamaContext.cpp +2 -2
- package/src/llama.cpp/common/CMakeLists.txt +2 -0
- package/src/llama.cpp/common/arg.cpp +364 -193
- package/src/llama.cpp/common/arg.h +43 -2
- package/src/llama.cpp/common/chat-peg-parser.cpp +16 -2
- package/src/llama.cpp/common/chat.cpp +140 -0
- package/src/llama.cpp/common/common.cpp +130 -67
- package/src/llama.cpp/common/common.h +40 -16
- package/src/llama.cpp/common/console.cpp +98 -18
- package/src/llama.cpp/common/console.h +30 -8
- package/src/llama.cpp/common/download.cpp +69 -25
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +132 -3
- package/src/llama.cpp/common/json-schema-to-grammar.h +20 -0
- package/src/llama.cpp/common/log.cpp +5 -0
- package/src/llama.cpp/common/log.h +1 -0
- package/src/llama.cpp/common/peg-parser.cpp +1 -1
- package/src/llama.cpp/common/preset.cpp +206 -0
- package/src/llama.cpp/common/preset.h +32 -0
- package/src/llama.cpp/common/sampling.cpp +91 -92
- package/src/llama.cpp/common/sampling.h +11 -6
- package/src/llama.cpp/common/speculative.cpp +1 -1
- package/src/llama.cpp/ggml/CMakeLists.txt +4 -0
- package/src/llama.cpp/ggml/include/ggml-alloc.h +9 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +1 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +7 -8
- package/src/llama.cpp/ggml/src/CMakeLists.txt +3 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +60 -39
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +4 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/repack.cpp +2 -1
- package/src/llama.cpp/include/llama.h +18 -1
- package/src/llama.cpp/src/llama-arch.cpp +1890 -2248
- package/src/llama.cpp/src/llama-arch.h +9 -2
- package/src/llama.cpp/src/llama-batch.cpp +12 -2
- package/src/llama.cpp/src/llama-batch.h +4 -2
- package/src/llama.cpp/src/llama-context.cpp +93 -23
- package/src/llama.cpp/src/llama-context.h +8 -2
- package/src/llama.cpp/src/llama-graph.cpp +84 -16
- package/src/llama.cpp/src/llama-graph.h +17 -4
- package/src/llama.cpp/src/llama-hparams.cpp +6 -0
- package/src/llama.cpp/src/llama-hparams.h +5 -1
- package/src/llama.cpp/src/llama-impl.cpp +4 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +90 -42
- package/src/llama.cpp/src/llama-kv-cache.h +19 -2
- package/src/llama.cpp/src/llama-memory-hybrid.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/src/llama.cpp/src/llama-model-loader.h +2 -0
- package/src/llama.cpp/src/llama-model.cpp +103 -44
- package/src/llama.cpp/src/llama-model.h +1 -0
- package/src/llama.cpp/src/llama-quant.cpp +1 -1
- package/src/llama.cpp/src/llama-vocab.cpp +2 -1
- package/src/llama.cpp/src/llama.cpp +675 -1
- package/src/llama.cpp/src/models/deepseek2.cpp +9 -5
- package/src/llama.cpp/src/models/glm4-moe.cpp +28 -11
- package/src/llama.cpp/src/models/glm4.cpp +27 -4
- package/src/llama.cpp/src/models/models.h +5 -5
- package/src/llama.cpp/src/models/nemotron-h.cpp +35 -6
- package/src/llama.cpp/src/models/qwen2.cpp +12 -3
- package/src/llama.cpp/src/models/qwen3next.cpp +81 -266
|
@@ -82,7 +82,8 @@ int32_t cpu_get_num_math();
|
|
|
82
82
|
enum llama_example {
|
|
83
83
|
LLAMA_EXAMPLE_COMMON,
|
|
84
84
|
LLAMA_EXAMPLE_SPECULATIVE,
|
|
85
|
-
|
|
85
|
+
LLAMA_EXAMPLE_COMPLETION,
|
|
86
|
+
LLAMA_EXAMPLE_CLI,
|
|
86
87
|
LLAMA_EXAMPLE_EMBEDDING,
|
|
87
88
|
LLAMA_EXAMPLE_PERPLEXITY,
|
|
88
89
|
LLAMA_EXAMPLE_RETRIEVAL,
|
|
@@ -98,6 +99,7 @@ enum llama_example {
|
|
|
98
99
|
LLAMA_EXAMPLE_TTS,
|
|
99
100
|
LLAMA_EXAMPLE_DIFFUSION,
|
|
100
101
|
LLAMA_EXAMPLE_FINETUNE,
|
|
102
|
+
LLAMA_EXAMPLE_FIT_PARAMS,
|
|
101
103
|
|
|
102
104
|
LLAMA_EXAMPLE_COUNT,
|
|
103
105
|
};
|
|
@@ -194,7 +196,6 @@ struct common_params_sampling {
|
|
|
194
196
|
|
|
195
197
|
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
|
|
196
198
|
|
|
197
|
-
|
|
198
199
|
std::vector<enum common_sampler_type> samplers = {
|
|
199
200
|
COMMON_SAMPLER_TYPE_PENALTIES,
|
|
200
201
|
COMMON_SAMPLER_TYPE_DRY,
|
|
@@ -215,6 +216,10 @@ struct common_params_sampling {
|
|
|
215
216
|
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
|
|
216
217
|
std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens
|
|
217
218
|
|
|
219
|
+
bool has_logit_bias() const {
|
|
220
|
+
return !logit_bias.empty();
|
|
221
|
+
}
|
|
222
|
+
|
|
218
223
|
// print the parameters into a string
|
|
219
224
|
std::string print() const;
|
|
220
225
|
};
|
|
@@ -303,8 +308,8 @@ struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata);
|
|
|
303
308
|
|
|
304
309
|
struct common_params {
|
|
305
310
|
bool vocab_only = false;
|
|
306
|
-
int32_t n_predict = -1; // new tokens to predict
|
|
307
|
-
int32_t n_ctx =
|
|
311
|
+
int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit
|
|
312
|
+
int32_t n_ctx = 0; // context size, 0 == context the model was trained with
|
|
308
313
|
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
|
|
309
314
|
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
|
|
310
315
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
@@ -325,9 +330,12 @@ struct common_params {
|
|
|
325
330
|
// offload params
|
|
326
331
|
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
|
327
332
|
|
|
328
|
-
int32_t n_gpu_layers
|
|
329
|
-
int32_t main_gpu
|
|
330
|
-
float tensor_split[128]
|
|
333
|
+
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
|
334
|
+
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
|
335
|
+
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
|
336
|
+
bool fit_params = true; // whether to fit unset model/context parameters to free device memory
|
|
337
|
+
size_t fit_params_target = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory
|
|
338
|
+
int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
|
|
331
339
|
|
|
332
340
|
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
|
333
341
|
|
|
@@ -407,6 +415,7 @@ struct common_params {
|
|
|
407
415
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
|
408
416
|
bool cont_batching = true; // insert new sequences for decoding on-the-fly
|
|
409
417
|
bool no_perf = false; // disable performance metrics
|
|
418
|
+
bool show_timings = true; // show timing information on CLI
|
|
410
419
|
bool ctx_shift = false; // context shift on infinite text generation
|
|
411
420
|
bool swa_full = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
|
|
412
421
|
bool kv_unified = false; // enable unified KV cache
|
|
@@ -463,7 +472,7 @@ struct common_params {
|
|
|
463
472
|
std::string public_path = ""; // NOLINT
|
|
464
473
|
std::string api_prefix = ""; // NOLINT
|
|
465
474
|
std::string chat_template = ""; // NOLINT
|
|
466
|
-
bool use_jinja =
|
|
475
|
+
bool use_jinja = true; // NOLINT
|
|
467
476
|
bool enable_chat_template = true;
|
|
468
477
|
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
|
|
469
478
|
int reasoning_budget = -1;
|
|
@@ -483,9 +492,10 @@ struct common_params {
|
|
|
483
492
|
bool endpoint_metrics = false;
|
|
484
493
|
|
|
485
494
|
// router server configs
|
|
486
|
-
std::string models_dir
|
|
487
|
-
|
|
488
|
-
|
|
495
|
+
std::string models_dir = ""; // directory containing models for the router server
|
|
496
|
+
std::string models_preset = ""; // directory containing model presets for the router server
|
|
497
|
+
int models_max = 4; // maximum number of models to load simultaneously
|
|
498
|
+
bool models_autoload = true; // automatically load models when requested via the router server
|
|
489
499
|
|
|
490
500
|
bool log_json = false;
|
|
491
501
|
|
|
@@ -667,15 +677,29 @@ bool tty_can_use_colors();
|
|
|
667
677
|
// Model utils
|
|
668
678
|
//
|
|
669
679
|
|
|
670
|
-
|
|
680
|
+
struct common_sampler;
|
|
681
|
+
|
|
682
|
+
// note: defines the model, context, samplers, etc. lifetimes
|
|
671
683
|
struct common_init_result {
|
|
672
|
-
|
|
673
|
-
|
|
684
|
+
common_init_result(common_params & params);
|
|
685
|
+
~common_init_result();
|
|
674
686
|
|
|
675
|
-
|
|
687
|
+
llama_model * model();
|
|
688
|
+
llama_context * context();
|
|
689
|
+
common_sampler * sampler(llama_seq_id seq_id);
|
|
690
|
+
|
|
691
|
+
std::vector<llama_adapter_lora_ptr> & lora();
|
|
692
|
+
|
|
693
|
+
void free_context();
|
|
694
|
+
|
|
695
|
+
private:
|
|
696
|
+
struct impl;
|
|
697
|
+
std::unique_ptr<impl> pimpl;
|
|
676
698
|
};
|
|
677
699
|
|
|
678
|
-
|
|
700
|
+
using common_init_result_ptr = std::unique_ptr<common_init_result>;
|
|
701
|
+
|
|
702
|
+
common_init_result_ptr common_init_from_params(common_params & params);
|
|
679
703
|
|
|
680
704
|
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
|
681
705
|
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include "console.h"
|
|
2
|
+
#include "log.h"
|
|
2
3
|
#include <vector>
|
|
3
4
|
#include <iostream>
|
|
4
5
|
#include <cassert>
|
|
@@ -6,6 +7,10 @@
|
|
|
6
7
|
#include <cctype>
|
|
7
8
|
#include <cwctype>
|
|
8
9
|
#include <cstdint>
|
|
10
|
+
#include <condition_variable>
|
|
11
|
+
#include <mutex>
|
|
12
|
+
#include <thread>
|
|
13
|
+
#include <stdarg.h>
|
|
9
14
|
|
|
10
15
|
#if defined(_WIN32)
|
|
11
16
|
#define WIN32_LEAN_AND_MEAN
|
|
@@ -35,6 +40,7 @@
|
|
|
35
40
|
#define ANSI_COLOR_BLUE "\x1b[34m"
|
|
36
41
|
#define ANSI_COLOR_MAGENTA "\x1b[35m"
|
|
37
42
|
#define ANSI_COLOR_CYAN "\x1b[36m"
|
|
43
|
+
#define ANSI_COLOR_GRAY "\x1b[90m"
|
|
38
44
|
#define ANSI_COLOR_RESET "\x1b[0m"
|
|
39
45
|
#define ANSI_BOLD "\x1b[1m"
|
|
40
46
|
|
|
@@ -61,17 +67,17 @@ namespace console {
|
|
|
61
67
|
//
|
|
62
68
|
#endif
|
|
63
69
|
|
|
64
|
-
static bool
|
|
65
|
-
static bool
|
|
66
|
-
static
|
|
70
|
+
static bool advanced_display = false;
|
|
71
|
+
static bool simple_io = true;
|
|
72
|
+
static display_type current_display = DISPLAY_TYPE_RESET;
|
|
67
73
|
|
|
68
|
-
static FILE*
|
|
74
|
+
static FILE* out = stdout;
|
|
69
75
|
|
|
70
76
|
#if defined (_WIN32)
|
|
71
|
-
static void*
|
|
77
|
+
static void* hConsole;
|
|
72
78
|
#else
|
|
73
|
-
static FILE*
|
|
74
|
-
static termios
|
|
79
|
+
static FILE* tty = nullptr;
|
|
80
|
+
static termios initial_state;
|
|
75
81
|
#endif
|
|
76
82
|
|
|
77
83
|
//
|
|
@@ -142,7 +148,7 @@ namespace console {
|
|
|
142
148
|
|
|
143
149
|
void cleanup() {
|
|
144
150
|
// Reset console display
|
|
145
|
-
set_display(
|
|
151
|
+
set_display(DISPLAY_TYPE_RESET);
|
|
146
152
|
|
|
147
153
|
#if !defined(_WIN32)
|
|
148
154
|
// Restore settings on POSIX systems
|
|
@@ -162,20 +168,26 @@ namespace console {
|
|
|
162
168
|
//
|
|
163
169
|
|
|
164
170
|
// Keep track of current display and only emit ANSI code if it changes
|
|
165
|
-
void set_display(
|
|
171
|
+
void set_display(display_type display) {
|
|
166
172
|
if (advanced_display && current_display != display) {
|
|
167
|
-
|
|
173
|
+
common_log_flush(common_log_main());
|
|
168
174
|
switch(display) {
|
|
169
|
-
case
|
|
175
|
+
case DISPLAY_TYPE_RESET:
|
|
170
176
|
fprintf(out, ANSI_COLOR_RESET);
|
|
171
177
|
break;
|
|
172
|
-
case
|
|
178
|
+
case DISPLAY_TYPE_INFO:
|
|
179
|
+
fprintf(out, ANSI_COLOR_MAGENTA);
|
|
180
|
+
break;
|
|
181
|
+
case DISPLAY_TYPE_PROMPT:
|
|
173
182
|
fprintf(out, ANSI_COLOR_YELLOW);
|
|
174
183
|
break;
|
|
175
|
-
case
|
|
184
|
+
case DISPLAY_TYPE_REASONING:
|
|
185
|
+
fprintf(out, ANSI_COLOR_GRAY);
|
|
186
|
+
break;
|
|
187
|
+
case DISPLAY_TYPE_USER_INPUT:
|
|
176
188
|
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
|
|
177
189
|
break;
|
|
178
|
-
case
|
|
190
|
+
case DISPLAY_TYPE_ERROR:
|
|
179
191
|
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
|
|
180
192
|
}
|
|
181
193
|
current_display = display;
|
|
@@ -778,7 +790,6 @@ namespace console {
|
|
|
778
790
|
}
|
|
779
791
|
|
|
780
792
|
if (is_special_char) {
|
|
781
|
-
set_display(user_input);
|
|
782
793
|
replace_last(line.back());
|
|
783
794
|
is_special_char = false;
|
|
784
795
|
}
|
|
@@ -961,7 +972,6 @@ namespace console {
|
|
|
961
972
|
}
|
|
962
973
|
|
|
963
974
|
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
|
|
964
|
-
set_display(prompt);
|
|
965
975
|
replace_last(line.back());
|
|
966
976
|
is_special_char = true;
|
|
967
977
|
}
|
|
@@ -1046,12 +1056,82 @@ namespace console {
|
|
|
1046
1056
|
}
|
|
1047
1057
|
|
|
1048
1058
|
bool readline(std::string & line, bool multiline_input) {
|
|
1049
|
-
set_display(user_input);
|
|
1050
|
-
|
|
1051
1059
|
if (simple_io) {
|
|
1052
1060
|
return readline_simple(line, multiline_input);
|
|
1053
1061
|
}
|
|
1054
1062
|
return readline_advanced(line, multiline_input);
|
|
1055
1063
|
}
|
|
1056
1064
|
|
|
1065
|
+
namespace spinner {
|
|
1066
|
+
static const char LOADING_CHARS[] = {'|', '/', '-', '\\'};
|
|
1067
|
+
static std::condition_variable cv_stop;
|
|
1068
|
+
static std::thread th;
|
|
1069
|
+
static size_t frame = 0; // only modified by one thread
|
|
1070
|
+
static bool running = false;
|
|
1071
|
+
static std::mutex mtx;
|
|
1072
|
+
static auto wait_time = std::chrono::milliseconds(100);
|
|
1073
|
+
static void draw_next_frame() {
|
|
1074
|
+
// don't need lock because only one thread modifies running
|
|
1075
|
+
frame = (frame + 1) % sizeof(LOADING_CHARS);
|
|
1076
|
+
replace_last(LOADING_CHARS[frame]);
|
|
1077
|
+
fflush(out);
|
|
1078
|
+
}
|
|
1079
|
+
void start() {
|
|
1080
|
+
std::unique_lock<std::mutex> lock(mtx);
|
|
1081
|
+
if (simple_io || running) {
|
|
1082
|
+
return;
|
|
1083
|
+
}
|
|
1084
|
+
common_log_flush(common_log_main());
|
|
1085
|
+
fprintf(out, "%c", LOADING_CHARS[0]);
|
|
1086
|
+
fflush(out);
|
|
1087
|
+
frame = 1;
|
|
1088
|
+
running = true;
|
|
1089
|
+
th = std::thread([]() {
|
|
1090
|
+
std::unique_lock<std::mutex> lock(mtx);
|
|
1091
|
+
while (true) {
|
|
1092
|
+
if (cv_stop.wait_for(lock, wait_time, []{ return !running; })) {
|
|
1093
|
+
break;
|
|
1094
|
+
}
|
|
1095
|
+
draw_next_frame();
|
|
1096
|
+
}
|
|
1097
|
+
});
|
|
1098
|
+
}
|
|
1099
|
+
void stop() {
|
|
1100
|
+
{
|
|
1101
|
+
std::unique_lock<std::mutex> lock(mtx);
|
|
1102
|
+
if (simple_io || !running) {
|
|
1103
|
+
return;
|
|
1104
|
+
}
|
|
1105
|
+
running = false;
|
|
1106
|
+
cv_stop.notify_all();
|
|
1107
|
+
}
|
|
1108
|
+
if (th.joinable()) {
|
|
1109
|
+
th.join();
|
|
1110
|
+
}
|
|
1111
|
+
replace_last(' ');
|
|
1112
|
+
pop_cursor();
|
|
1113
|
+
fflush(out);
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
|
|
1117
|
+
void log(const char * fmt, ...) {
|
|
1118
|
+
va_list args;
|
|
1119
|
+
va_start(args, fmt);
|
|
1120
|
+
vfprintf(out, fmt, args);
|
|
1121
|
+
va_end(args);
|
|
1122
|
+
}
|
|
1123
|
+
|
|
1124
|
+
void error(const char * fmt, ...) {
|
|
1125
|
+
va_list args;
|
|
1126
|
+
va_start(args, fmt);
|
|
1127
|
+
display_type cur = current_display;
|
|
1128
|
+
set_display(DISPLAY_TYPE_ERROR);
|
|
1129
|
+
vfprintf(out, fmt, args);
|
|
1130
|
+
set_display(cur); // restore previous color
|
|
1131
|
+
va_end(args);
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
void flush() {
|
|
1135
|
+
fflush(out);
|
|
1136
|
+
}
|
|
1057
1137
|
}
|
|
@@ -2,18 +2,40 @@
|
|
|
2
2
|
|
|
3
3
|
#pragma once
|
|
4
4
|
|
|
5
|
+
#include "common.h"
|
|
6
|
+
|
|
5
7
|
#include <string>
|
|
6
8
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
9
|
+
// Display modes accepted by console::set_display().
// Values are consecutive, starting at 0 for the reset mode.
enum display_type {
    DISPLAY_TYPE_RESET      = 0,
    DISPLAY_TYPE_INFO       = 1,
    DISPLAY_TYPE_PROMPT     = 2,
    DISPLAY_TYPE_REASONING  = 3,
    DISPLAY_TYPE_USER_INPUT = 4,
    DISPLAY_TYPE_ERROR      = 5
};
|
|
14
17
|
|
|
18
|
+
namespace console {
|
|
15
19
|
void init(bool use_simple_io, bool use_advanced_display);
|
|
16
20
|
void cleanup();
|
|
17
|
-
void set_display(
|
|
21
|
+
void set_display(display_type display);
|
|
18
22
|
bool readline(std::string & line, bool multiline_input);
|
|
23
|
+
|
|
24
|
+
namespace spinner {
|
|
25
|
+
void start();
|
|
26
|
+
void stop();
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
// note: the logging API below outputs directly to stdout
|
|
30
|
+
// it can negatively impact performance if used on inference thread
|
|
31
|
+
// only use it in a dedicated CLI thread
|
|
32
|
+
// for logging in inference thread, use log.h instead
|
|
33
|
+
|
|
34
|
+
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
35
|
+
void log(const char * fmt, ...);
|
|
36
|
+
|
|
37
|
+
LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
|
|
38
|
+
void error(const char * fmt, ...);
|
|
39
|
+
|
|
40
|
+
void flush();
|
|
19
41
|
}
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
#include <filesystem>
|
|
13
13
|
#include <fstream>
|
|
14
14
|
#include <future>
|
|
15
|
+
#include <map>
|
|
16
|
+
#include <mutex>
|
|
15
17
|
#include <regex>
|
|
16
18
|
#include <string>
|
|
17
19
|
#include <thread>
|
|
@@ -472,36 +474,79 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
|
|
|
472
474
|
|
|
473
475
|
#elif defined(LLAMA_USE_HTTPLIB)
|
|
474
476
|
|
|
475
|
-
|
|
477
|
+
// Draws a textual download progress bar on stdout.
//
// Every instance that has drawn at least once is assigned its own terminal
// line, so several bars can be updated concurrently: update() saves the
// cursor, moves up to the bar's line, redraws it and restores the cursor.
// All shared state and all terminal writes are serialized by a class-wide mutex.
// Nothing is printed when stdout is not a terminal.
class ProgressBar {
    static inline std::mutex mutex;                         // guards lines, max_line and the writes to std::cout
    static inline std::map<const ProgressBar *, int> lines; // line index assigned to each active bar
    static inline int max_line = 0;                         // number of line indices handed out so far

    // Drop the bar from the registry; restart numbering once no bar is left.
    // Callers must hold `mutex`.
    static void cleanup(const ProgressBar * line) {
        lines.erase(line);
        if (lines.empty()) {
            max_line = 0;
        }
    }

    static bool is_output_a_tty() {
#if defined(_WIN32)
        return _isatty(_fileno(stdout));
#else
        return isatty(1);
#endif
    }

public:
    ProgressBar() = default;

    ~ProgressBar() {
        std::lock_guard<std::mutex> lock(mutex);
        cleanup(this);
    }

    // Redraw the bar.
    //   current - amount completed so far (the MB figures divide it by 1024*1024)
    //   total   - expected final amount; 0 means unknown and nothing is drawn
    // A `current` larger than `total` is drawn as a full bar at 100%.
    void update(size_t current, size_t total) {
        if (!is_output_a_tty()) {
            return;
        }

        if (!total) {
            return;
        }

        std::lock_guard<std::mutex> lock(mutex);

        // First draw of this bar: reserve a fresh terminal line for it.
        if (lines.find(this) == lines.end()) {
            lines[this] = max_line++;
            std::cout << "\n";
        }
        const int lines_up = max_line - lines[this];

        const size_t width = 50;
        // Clamp so that pos never exceeds width; otherwise `width - pos`
        // underflows and std::string(width - pos, ' ') throws.
        const size_t shown = current < total ? current : total;
        // Multiply in 64 bits so the products cannot wrap on a 32-bit size_t.
        const size_t pct = static_cast<size_t>((100ULL * shown) / total);
        const size_t pos = static_cast<size_t>((1ULL * width * shown) / total);

        std::cout << "\033[s"; // save cursor position

        if (lines_up > 0) {
            std::cout << "\033[" << lines_up << "A"; // move up to this bar's line
        }
        std::cout << "\033[2K\r[" // erase the line, return to column 0
                  << std::string(pos, '=')
                  << (pos < width ? ">" : "")
                  << std::string(width - pos, ' ')
                  << "] " << std::setw(3) << pct << "% ("
                  << current / (1024 * 1024) << " MB / "
                  << total / (1024 * 1024) << " MB) "
                  << "\033[u"; // restore cursor position

        std::cout.flush();

        // Finished (or overshot): release the line.
        if (current >= total) {
            cleanup(this);
        }
    }

    // The registry is keyed by object address, so bars are not copyable.
    ProgressBar(const ProgressBar &) = delete;
    ProgressBar & operator=(const ProgressBar &) = delete;
};
|
|
505
550
|
|
|
506
551
|
static bool common_pull_file(httplib::Client & cli,
|
|
507
552
|
const std::string & resolve_path,
|
|
@@ -523,6 +568,7 @@ static bool common_pull_file(httplib::Client & cli,
|
|
|
523
568
|
const char * func = __func__; // avoid __func__ inside a lambda
|
|
524
569
|
size_t downloaded = existing_size;
|
|
525
570
|
size_t progress_step = 0;
|
|
571
|
+
ProgressBar bar;
|
|
526
572
|
|
|
527
573
|
auto res = cli.Get(resolve_path, headers,
|
|
528
574
|
[&](const httplib::Response &response) {
|
|
@@ -554,7 +600,7 @@ static bool common_pull_file(httplib::Client & cli,
|
|
|
554
600
|
progress_step += len;
|
|
555
601
|
|
|
556
602
|
if (progress_step >= total_size / 1000 || downloaded == total_size) {
|
|
557
|
-
|
|
603
|
+
bar.update(downloaded, total_size);
|
|
558
604
|
progress_step = 0;
|
|
559
605
|
}
|
|
560
606
|
return true;
|
|
@@ -562,8 +608,6 @@ static bool common_pull_file(httplib::Client & cli,
|
|
|
562
608
|
nullptr
|
|
563
609
|
);
|
|
564
610
|
|
|
565
|
-
std::cout << "\n";
|
|
566
|
-
|
|
567
611
|
if (!res) {
|
|
568
612
|
LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
|
|
569
613
|
return false;
|
|
@@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) {
|
|
|
305
305
|
|
|
306
306
|
std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); }
|
|
307
307
|
|
|
308
|
-
class
|
|
308
|
+
class common_schema_converter {
|
|
309
309
|
private:
|
|
310
|
+
friend class common_schema_info;
|
|
310
311
|
friend std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options);
|
|
311
312
|
std::function<json(const std::string &)> _fetch_json;
|
|
312
313
|
bool _dotall;
|
|
@@ -729,7 +730,7 @@ private:
|
|
|
729
730
|
}
|
|
730
731
|
|
|
731
732
|
public:
|
|
732
|
-
|
|
733
|
+
common_schema_converter(
|
|
733
734
|
const std::function<json(const std::string &)> & fetch_json,
|
|
734
735
|
bool dotall)
|
|
735
736
|
: _fetch_json(fetch_json), _dotall(dotall)
|
|
@@ -990,6 +991,134 @@ public:
|
|
|
990
991
|
}
|
|
991
992
|
};
|
|
992
993
|
|
|
994
|
+
// common_schema_info implementation (pimpl)
|
|
995
|
+
|
|
996
|
+
common_schema_info::common_schema_info()
|
|
997
|
+
: impl_(std::make_unique<common_schema_converter>(
|
|
998
|
+
[](const std::string &) { return json(); },
|
|
999
|
+
false)) {}
|
|
1000
|
+
|
|
1001
|
+
common_schema_info::~common_schema_info() = default;
|
|
1002
|
+
|
|
1003
|
+
common_schema_info::common_schema_info(common_schema_info &&) noexcept = default;
|
|
1004
|
+
common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default;
|
|
1005
|
+
|
|
1006
|
+
void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) {
|
|
1007
|
+
impl_->resolve_refs(schema, "");
|
|
1008
|
+
}
|
|
1009
|
+
|
|
1010
|
+
// Determines if a JSON schema can resolve to a string type through any path.
|
|
1011
|
+
// Some models emit raw string values rather than JSON-encoded strings for string parameters.
|
|
1012
|
+
// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns
|
|
1013
|
+
// true, allowing callers to handle the value as a raw string for simplicity.
|
|
1014
|
+
bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) {
|
|
1015
|
+
std::unordered_set<std::string> visited_refs;
|
|
1016
|
+
|
|
1017
|
+
std::function<bool(const json &)> check = [&](const json & s) -> bool {
|
|
1018
|
+
if (!s.is_object()) {
|
|
1019
|
+
return false;
|
|
1020
|
+
}
|
|
1021
|
+
|
|
1022
|
+
// Handle $ref
|
|
1023
|
+
if (s.contains("$ref")) {
|
|
1024
|
+
const std::string & ref = s["$ref"];
|
|
1025
|
+
if (visited_refs.find(ref) != visited_refs.end()) {
|
|
1026
|
+
// Circular reference, assume not a string to be safe
|
|
1027
|
+
return false;
|
|
1028
|
+
}
|
|
1029
|
+
visited_refs.insert(ref);
|
|
1030
|
+
auto it = impl_->_refs.find(ref);
|
|
1031
|
+
if (it != impl_->_refs.end()) {
|
|
1032
|
+
return check(it->second);
|
|
1033
|
+
}
|
|
1034
|
+
return false;
|
|
1035
|
+
}
|
|
1036
|
+
|
|
1037
|
+
// Check type field
|
|
1038
|
+
if (s.contains("type")) {
|
|
1039
|
+
const json & schema_type = s["type"];
|
|
1040
|
+
if (schema_type.is_string()) {
|
|
1041
|
+
if (schema_type == "string") {
|
|
1042
|
+
return true;
|
|
1043
|
+
}
|
|
1044
|
+
} else if (schema_type.is_array()) {
|
|
1045
|
+
// Type can be an array like ["string", "null"]
|
|
1046
|
+
for (const auto & t : schema_type) {
|
|
1047
|
+
if (t == "string") {
|
|
1048
|
+
return true;
|
|
1049
|
+
}
|
|
1050
|
+
}
|
|
1051
|
+
}
|
|
1052
|
+
}
|
|
1053
|
+
|
|
1054
|
+
// Check oneOf/anyOf - if any alternative can be a string
|
|
1055
|
+
if (s.contains("oneOf")) {
|
|
1056
|
+
for (const auto & alt : s["oneOf"]) {
|
|
1057
|
+
if (check(alt)) {
|
|
1058
|
+
return true;
|
|
1059
|
+
}
|
|
1060
|
+
}
|
|
1061
|
+
}
|
|
1062
|
+
if (s.contains("anyOf")) {
|
|
1063
|
+
for (const auto & alt : s["anyOf"]) {
|
|
1064
|
+
if (check(alt)) {
|
|
1065
|
+
return true;
|
|
1066
|
+
}
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
|
|
1070
|
+
// Check allOf - all components must be compatible with string type
|
|
1071
|
+
if (s.contains("allOf")) {
|
|
1072
|
+
bool all_string = true;
|
|
1073
|
+
for (const auto & component : s["allOf"]) {
|
|
1074
|
+
if (!check(component)) {
|
|
1075
|
+
all_string = false;
|
|
1076
|
+
break;
|
|
1077
|
+
}
|
|
1078
|
+
}
|
|
1079
|
+
if (all_string) {
|
|
1080
|
+
return true;
|
|
1081
|
+
}
|
|
1082
|
+
}
|
|
1083
|
+
|
|
1084
|
+
// Check const - if the constant value is a string
|
|
1085
|
+
if (s.contains("const")) {
|
|
1086
|
+
if (s["const"].is_string()) {
|
|
1087
|
+
return true;
|
|
1088
|
+
}
|
|
1089
|
+
}
|
|
1090
|
+
|
|
1091
|
+
// Check enum - if any enum value is a string
|
|
1092
|
+
if (s.contains("enum")) {
|
|
1093
|
+
for (const auto & val : s["enum"]) {
|
|
1094
|
+
if (val.is_string()) {
|
|
1095
|
+
return true;
|
|
1096
|
+
}
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
|
|
1100
|
+
// String-specific keywords imply string type
|
|
1101
|
+
if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) {
|
|
1102
|
+
return true;
|
|
1103
|
+
}
|
|
1104
|
+
|
|
1105
|
+
// Check format - many formats imply string
|
|
1106
|
+
if (s.contains("format")) {
|
|
1107
|
+
const std::string & fmt = s["format"];
|
|
1108
|
+
if (fmt == "date" || fmt == "time" || fmt == "date-time" ||
|
|
1109
|
+
fmt == "uri" || fmt == "email" || fmt == "hostname" ||
|
|
1110
|
+
fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" ||
|
|
1111
|
+
fmt.find("uuid") == 0) {
|
|
1112
|
+
return true;
|
|
1113
|
+
}
|
|
1114
|
+
}
|
|
1115
|
+
|
|
1116
|
+
return false;
|
|
1117
|
+
};
|
|
1118
|
+
|
|
1119
|
+
return check(schema);
|
|
1120
|
+
}
|
|
1121
|
+
|
|
993
1122
|
std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
|
994
1123
|
#ifdef LLAMA_USE_LLGUIDANCE
|
|
995
1124
|
if (!force_gbnf) {
|
|
@@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) {
|
|
|
1006
1135
|
}
|
|
1007
1136
|
|
|
1008
1137
|
std::string build_grammar(const std::function<void(const common_grammar_builder &)> & cb, const common_grammar_options & options) {
|
|
1009
|
-
|
|
1138
|
+
common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall);
|
|
1010
1139
|
common_grammar_builder builder {
|
|
1011
1140
|
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
|
|
1012
1141
|
return converter._add_rule(name, rule);
|